# CUI/SP-CENS. Disclosure Prohibited—T13 U.S.C.

# Analysis code for "Residential mobility and persistently depressed voting among disadvantaged adults in a large housing experiment"
# PNAS Publication R Code
# Authors: David Knight and Baobao Zhang
# Prepared by: Baobao Zhang
# Note: Some aspects of the code are redacted as part of the U.S. Census Bureau disclosure process

#################################################################
# Part 1: Create MTO PIK dataset for each state

# Load libraries
library("haven")
library("dplyr")

# Load in the MTO PVS dataset
mto <- read_sas("[redacted]/mto1994_2010.sas7bdat")
# Get the PIK
mto_pik <- mto$pik
# Check how many do not have a PIK
table(mto_pik == "")
# [redacted] subjects do not have a PIK
rm("mto")

# Load the L2 observation count dataset
l2_count <- read.csv("/projects/data/l2_obs_count.csv")

# Function to match the PIK 
match_pik_function <- function(state, load_n = 100000) {
  obs_state <- l2_count$obs[l2_count$state == state]
  # Number of chunks
  chunks_n <- floor(obs_state/load_n + 1)
  # The name of the folder
  pvs_state <- paste0("[redacted]/uchicago_l2_2020_", state,
                      "_pvs.sas7bdat")
  # Check the max number of observations is correct
  check_state <- read_sas(data_file=pvs_state, col_select = c("pik", "CARRA_UID"),
                          skip = obs_state - 1)
  # Output the results
  if (as.numeric(nrow(check_state)) == 1) {
    print(paste0("Max obs for ", state, " correct"))
  } else {
    print(paste0("Max obs for ", state, " incorrect"))
  }
  # Remove check_sate
  rm("check_state")
  # Create a state file
  state_file_name <- paste0("/projects/data/data_matched_pik_", state, ".csv")
  write.table(data.frame("pik", "CARRA_UID"), state_file_name,  
              sep=",", row.names = FALSE, 
              append = TRUE, col.names = FALSE)
  # Load in chunks and output the matched PIK and CARRA_UID
  for (i in 1:chunks_n) {
    # Print where it is
    print(paste0(state, ": chunk ", i, " out of ", chunks_n))
    # Load in the chunk
    l2 <- read_sas(data_file=pvs_state, col_select = c("pik", "CARRA_UID"),
                   skip = (i-1)*load_n, n_max = load_n)
    # Match the PIK
    matched_pik <- l2[l2$pik %in% mto_pik & l2$pik != "",]
    # Get the number of matches
    nrow_matched_inchunk <- nrow(matched_pik)
    if (nrow_matched_inchunk > 0) {
      # Write the matched PIK and CARRA_UID to a spreadsheet
      write.table(matched_pik, state_file_name, 
                  sep=",", row.names = FALSE, 
                  append = TRUE, col.names = FALSE)
      print(paste0("Number of matches: ", nrow_matched_inchunk))
    } else {
      print(paste0("Number of matches: ", nrow_matched_inchunk))
    }
    rm("l2")
  }
  print(paste0(state, ": matching PIK completed"))
}

# Loop through all the states
for (st in l2_count$state[1:51]) {
  match_pik_function(state=st)
}


#################################################################
# Part 2: Extract the L2 data for only those in the MTO datasets

# Load libraries
library("haven")
library("dplyr")
library("Hmisc")

# Function to extract the L2 data by state

extract_rsch_data <- function(state) {
  # Load in the MTO PVS dataset
  d <- read.csv(paste0("/projects/data/matched_pik_state/data_matched_pik_",
                       state, ".csv"))
  if (nrow(d) < 1) {
    return(print(paste0(state, ": 0 matched PIK")))
  }
  # Get the state
  rsch_state <- paste0("[redacted]/uchicago_l2_2020_", state,
                       "_rsch.sas7bdat")
  # Load in only the Id variable
  l2 <- read_sas(data_file=rsch_state,
                 col_select = DQB_SOURCE_ID)
  print(paste0(state, " : IDs loaded"))
  names(l2)[1] <- "num_id"
  # Change the names of the variables
  names(d) <- c("CARRA_UID", "pik")
  # Convert the linking variable to to numeric
  d$num_id <- as.numeric(gsub(pattern=paste0(state, "2020UCL2"), replacement="",
                              d$CARRA_UID))
  l2$num_id <- as.numeric(gsub(pattern=paste0("uchicago_l2_2020_", state), 
                               replacement="",
                               l2$num_id))
  # Get the matched rows
  matched_row <- which(l2$num_id %in% d$num_id & (l2$num_id != "" | 
                                         !is.na(l2$num_id)))
  # Remove the L2 ids
  rm("l2")
  
  # Create an research data frame of the size that we want
  l2_container <- read_sas(data_file=rsch_state, n_max=length(matched_row))
  
  for (i in 1:length(matched_row)) {
    # Load in one matched row at once into the container data frame
    l2_container[i,] <- read_sas(data_file=rsch_state, skip=matched_row[i]-1,
                                 n_max=1)
    print(paste0(state,": ", i, " out of ", length(matched_row), " rows"))
  }
  
  # Add in the PIK
  l2_container$num_id <- as.numeric(gsub(pattern=paste0("uchicago_l2_2020_", state), 
                                         replacement="",
                                         l2_container$DQB_SOURCE_ID))
  l2_container <- merge(x = l2_container, y = d[,c("pik", "num_id")], all.x = TRUE,
                        by = "num_id")
  
  # Save the container
  save(l2_container, file=paste0("/projects/data/matched_research_data/rsch_data_matched_", state, ".RData"))
  
  return(print(paste0(state, ": research file saved")))
}


# Load the L2 observation count dataset: the number of observations per state
l2_count <- read.csv("/projects/data/l2_obs_count.csv", stringsAsFactors = FALSE)
l2_count <- l2_count[order(l2_count$obs, decreasing=TRUE),]
# Sort into groups for the batch run in parallel
l2_count$group_num <- NA
l2_count$group_num[l2_count$state == "ca"] <- 1
l2_count$group_num[l2_count$state %in% c("tx", "va")] <- 2
l2_count$group_num[l2_count$state %in% c("fl", "nj")] <- 3
l2_count$group_num[l2_count$state %in% c("ny", "il", "nc") ] <- 4
l2_count$group_num[l2_count$state %in% c("pa", "oh", "mi")] <- 5


# Randomize the rest of the states into 10 batches
# set.seed(######) seed value redacted
l2_count$group_num[is.na(l2_count$group_num)] <- 
  sample(rep(6:10, 8), size=sum(is.na(l2_count$group_num)))

# Run each group [This was done in parallel in the actual analysis.]

# All the states that are not California 

# Group 2
for (st in l2_count$state[l2_count$group_num == 2]) {
  extract_rsch_data(state = st)
}

# Group 3
for (st in l2_count$state[l2_count$group_num == 3]) {
  extract_rsch_data(state = st)
}

# Group 4
for (st in l2_count$state[l2_count$group_num == 4]) {
  extract_rsch_data(state = st)
}

# Group 5
for (st in l2_count$state[l2_count$group_num == 5]) {
  extract_rsch_data(state = st)
}

# Group 6
for (st in l2_count$state[l2_count$group_num == 6]) {
  extract_rsch_data(state = st)
}

# Group 7
for (st in l2_count$state[l2_count$group_num == 7]) {
  extract_rsch_data(state = st)
}

# Group 8
for (st in l2_count$state[l2_count$group_num == 8]) {
  extract_rsch_data(state = st)
}

# Group 9
for (st in l2_count$state[l2_count$group_num == 9]) {
  extract_rsch_data(state = st)
}

# Group 10
for (st in l2_count$state[l2_count$group_num == 10]) {
  extract_rsch_data(state = st)
}

# Clean up the California Data in 10 chunks

# Load libraries
library("haven")
library("dplyr")
library("Hmisc")

chunk_num <- 1

# Function to write the data in 10 chunks

california_chunk <- function(chunk_num) {

	state <- "ca"
	n_max_rows_chunk <- [redacted chunk size]

	# Load in the MTO PVS dataset
	d <- read.csv(paste0("/projects/data/matched_pik_state/data_matched_pik_",
		             state, ".csv"))
	# Change the names of the variables
	names(d) <- c("CARRA_UID", "pik")
	# Convert the linking variable to to numeric
	d$num_id <- as.numeric(gsub(pattern=paste0(state, "2020UCL2"), replacement="",
		                    d$CARRA_UID))

	# Load in the matched rows
	load("/projects/programs/rsch_clean/matched_rows_ca.RData")


	rsch_state <- paste0("[redacted]/uchicago_l2_2020_", state,
		             "_rsch.sas7bdat")
	# Create an research data frame of the size that we want
	l2_container <- read_sas(data_file=rsch_state, n_max=n_max_rows_chunk)

	# Keep the chunk rows
	chunk_rows_start <- [redacted chunk size]*(chunk_num - 1) + 1
	chunk_row_end <- [redacted chunk size]*chunk_num
	matched_row <- matched_row[chunk_rows_start:chunk_row_end]

	for (i in 1:length(matched_row)) {
	  # Load in one matched row at once into the container data frame
	  l2_container[i,] <- read_sas(data_file=rsch_state, skip=matched_row[i]-1,
		                       n_max=1)
	  print(paste0("chunk: ", chunk_num, "; ", 
		       state,": ", i, " out of ", length(matched_row), " rows"))
	}

	# Add in the PIK
	l2_container$num_id <- as.numeric(gsub(pattern=paste0("uchicago_l2_2020_", state), 
		                               replacement="",
		                               l2_container$DQB_SOURCE_ID))
	l2_container <- merge(x = l2_container, y = d[,c("pik", "num_id")], all.x = TRUE,
		              by = "num_id")

	# Save the container
	save(l2_container, file=paste0("/projects/data/matched_research_data/rsch_data_matched_", state, "_chunk_", chunk_num, ".RData"))

}

# Run the function for the 10 chunks [This was run in parallel in the actual analysis.]

california_chunk(1)
california_chunk(2)
california_chunk(3)
california_chunk(4)
california_chunk(5)
california_chunk(6)
california_chunk(7)
california_chunk(8)
california_chunk(9)
california_chunk(10) 

# Combine matched voter registration files

# Set working directory
setwd("/projects/data/matched_research_data")

# Get all the file names
rsch_list <- list.files()

rsch_dat <- load(rsch_list[1])
# Append state by state
for (i in 2:length(rsch_list)) {
  load(rsch_list[i])
  print(nrow(l2_container))
  rsch_dat <- rbind(rsch_dat, l2_container)
}

# Save the L2 data
save(rsch_dat, file="/projects/data/analysis_data/l2_data.RData")


#######################################################################
# Part 3: Check for duplicated or missing PIK; clean up the L2 dataset

# Add in MTO ID

# Load additional libraries
library("lubridate")
library("estimatr")

# Load the L2 dataset
load("/projects/data/analysis_data/l2_data.RData")
# Check duplicates
rsch_dat_pik <- rsch_dat[rsch_dat$pik != "l2_container",]
rsch_dup_pik <- rsch_dat$pik[duplicated(rsch_dat$pik)]
dup_rsch_pik <- rsch_dat[rsch_dat$pik %in% rsch_dup_pik,]
ndup_rsch_pik <- rsch_dat[!rsch_dat$pik %in% rsch_dup_pik,]
dup_rsch_pik <- dup_rsch_pik[order(dup_rsch_pik$pik),]
# Create a dataset to check for combination of unique variables
check_unique <- dup_rsch_pik %>% group_by(pik) %>% dplyr::summarise(
  unique_zip = length(unique(RAddr_Zip)),
  unique_birthday = length(unique(Voters_BirthDate)),
  unique_gender = length(unique(Voters_Gender)),
  unique_registration = length(unique(Voters_OfficialRegDate)),
  unique_la_id = length(DQB_SOURCE_ID)
  )
# Variable for unique birthday/year + gender
check_unique$same_birthday_gender <- check_unique$unique_birthday == 1 & 
  check_unique$unique_gender == 1
# Variable for unique birthday/gender + zipcode
check_unique$same_birthday_gender_zip <- check_unique$unique_birthday == 1 & 
  check_unique$unique_gender == 1 & check_unique$unique_zip == 1
# Check how many PIKs don't have to same birthday/year + gender
table(check_unique$same_birthday_gender)
table(check_unique$same_birthday_gender_zip)
# Create data frame that needs to be compared against the MTO data
dup_rsch_pik_hc <- dup_rsch_pik[!dup_rsch_pik$pik %in%
                                  check_unique$pik[check_unique$same_birthday_gender],] 



# Load MTO datasets
# ID
mto_id <- read_sas("[redacted]/mto1994_2010.sas7bdat")
# Final roster
mto_r <- read_sas("[redacted]/mto_fin_roster.sas7bdat")
mto_r$f_svy_yob_imp
mto_r$f_svy_age2007_imp
# Check for missings and duplicates
table(mto_r$ra_group, useNA="always")
table(mto_r$f_z_dropdup_flag)
table(duplicated(mto_r$PPID))

# Check how many in the roster has ID
names(mto_id)[names(mto_id) == "ppid"] <- "PPID"
# Merge the id into the roster
mto_r <- merge(x = mto_r, y = mto_id, all.x = TRUE, id = "PPID")
# Change the PIK missing to ""
mto_r$pik[mto_r$pik == ""] <- NA
# Create a variable for missing PIK
mto_r$pik_missing <- is.na(mto_r$pik)
# Create a variable for 
mto_r$pik_duplicated <- duplicated(mto_r$pik) 
mto_r$pik_duplicated[mto_r$pik_missing] <- NA
table(mto_r$pik_duplicated, useNA="always")
# Create a spreadsheet for manual checking
mto_r_hc <- mto_r[mto_r$pik %in% dup_rsch_pik_hc$pik & !is.na(mto_r$pik),]

# Determine which ones to keep in dup_rsch_pik_hc
dup_rsch_pik_hc$keep <- FALSE
# Create a year of birth variable
dup_rsch_pik_hc$dob_year <- 
  year(parse_date_time(dup_rsch_pik_hc$Voters_BirthDate, orders="%m/%d/%Y"))
# For loop to get all the keeps
for (pik in mto_r_hc$pik) {
  l2_row <- which(dup_rsch_pik_hc$pik == pik)
  # get the year of birth and gender
  dob_year <- as.numeric(mto_r_hc$dob_year[mto_r_hc$pik == pik])
  gender <- mto_r_hc$sex[mto_r_hc$pik == pik]
  l2_row_keep <- l2_row[dup_rsch_pik_hc$dob_year[l2_row] == dob_year &
    dup_rsch_pik_hc$Voters_Gender[l2_row] == gender]
  dup_rsch_pik_hc$keep[l2_row_keep] <- TRUE
}

# Throw away the bad duplicates and combine the good duplicates
dup_rsch_pik$keep <- FALSE
dup_rsch_pik$keep[dup_rsch_pik$pik %in%
                    check_unique$pik[check_unique$same_birthday_gender]] <- TRUE
dup_rsch_pik$keep[dup_rsch_pik$DQB_SOURCE_ID %in%
                    dup_rsch_pik_hc$DQB_SOURCE_ID[dup_rsch_pik_hc$keep]] <- 
  TRUE
# Check what percentage we get to keep
table(dup_rsch_pik$keep)
# Combine the outcomes for the good duplicates
dup_rsch_pik_c <- dup_rsch_pik[dup_rsch_pik$keep,] %>% group_by(pik) %>% dplyr::summarize(
  l2_num_records = n(),
  Voters_Active_c = mean(Voters_Active == "A"),
  Ethnic_Desc_Afr_self_reported_c = mean(Ethnic_Desc == 
                                           "African or Af-Am Self Reported"),
  Ethnic_Desc_Afr_modeled_c = mean(Ethnic_Desc == "Likely Af-Am (Modeled)"),
  Ethnic_Desc_Hispanic_c = mean(Ethnic_Desc == "Hispanic"),
  General_2020_c = mean(General_2020 == "Y"),
  Primary_2020_c = mean(Primary_2020 == "Y"),
  OtherElection_2020_c = mean(OtherElection_2020 == "Y"),	
  AnyElection_2019_c = mean(AnyElection_2019 == "Y"),
  General_2018_c = mean(General_2018 == "Y"),	
  Primary_2018_c = mean(Primary_2018 == "Y"),
  OtherElection_2018_c = mean(OtherElection_2018 == "Y"),
  AnyElection_2017_c = mean(AnyElection_2017 == "Y"),
  General_2016_c = mean(General_2016 == "Y"),
  Primary_2016_c = mean(Primary_2016 == "Y"),
  OtherElection_2016_c = mean(OtherElection_2016 == "Y"),
  AnyElection_2015_c = mean(AnyElection_2015 == "Y"),
  General_2014_c = mean(General_2014 == "Y"),
  Primary_2014_c = mean(Primary_2014 == "Y"),
  OtherElection_2014_c = mean(OtherElection_2014 == "Y"),
  AnyElection_2013_c = mean(AnyElection_2013 == "Y"),
  General_2012_c = mean(General_2012 == "Y"),
  Primary_2012_c = mean(Primary_2012 == "Y"),
  OtherElection_2012_c = mean(OtherElection_2012 == "Y"),
  AnyElection_2011_c = mean(AnyElection_2011 == "Y"),
  General_2010_c = mean(General_2010 == "Y"),
  Primary_2010_c = mean(Primary_2010 == "Y"),
  OtherElection_2010_c = mean(OtherElection_2010 == "Y"),
  AnyElection_2009_c = mean(OtherElection_2010 == "Y"),
  General_2008_c = mean(General_2008 == "Y"),
  Primary_2008_c = mean(Primary_2008 == "Y"),
  OtherElection_2008_c = mean(OtherElection_2008 == "Y"),
  AnyElection_2007_c = mean(AnyElection_2007 == "Y"),
  General_2006_c = mean(General_2006 == "Y"),
  Primary_2006_c = mean(Primary_2006 == "Y"),	
  OtherElection_2006_c = mean(OtherElection_2006 == "Y"),
  AnyElection_2005_c = mean(AnyElection_2005 == "Y"),
  General_2004_c = mean(General_2004 == "Y"),
  Primary_2004_c = mean(Primary_2004 == "Y"),
  OtherElection_2004_c = mean(OtherElection_2004 == "Y"),
  AnyElection_2003_c = mean(AnyElection_2003 == "Y"),
  General_2002_c = mean(General_2002 == "Y"),
  Primary_2002_c = mean(Primary_2002 == "Y"),
  OtherElection_2002_c = mean(OtherElection_2002 == "Y"),
  AnyElection_2001_c = mean(AnyElection_2001 == "Y"),
  General_2000_c = mean(General_2000 == "Y"),
  Primary_2000_c = mean(Primary_2000 == "Y"),
  OtherElection_2000_c = mean(OtherElection_2000 == "Y")
  )
# Create data frame of binary outcomes
dup_rsch_pik_c_outcome_b <- as.data.frame(dup_rsch_pik_c[,7:ncol(dup_rsch_pik_c)] > 0)
names(dup_rsch_pik_c_outcome_b) <- gsub(pattern="_c", replacement="_b",
                                        names(dup_rsch_pik_c_outcome_b))
# Merge in with the other data
dup_rsch_pik_c <- data.frame(dup_rsch_pik_c, dup_rsch_pik_c_outcome_b)

# Clean up the non-duplicated data frame
ndup_rsch_pik_c <- ndup_rsch_pik %>% group_by(pik) %>% dplyr::summarize(
  l2_num_records = n(),
  Voters_Active_c = mean(Voters_Active == "A"),
  Ethnic_Desc_Afr_self_reported_c = mean(Ethnic_Desc == 
                                           "African or Af-Am Self Reported"),
  Ethnic_Desc_Afr_modeled_c = mean(Ethnic_Desc == "Likely Af-Am (Modeled)"),
  Ethnic_Desc_Hispanic_c = mean(Ethnic_Desc == "Hispanic"),
  General_2020_c = mean(General_2020 == "Y"),
  Primary_2020_c = mean(Primary_2020 == "Y"),
  OtherElection_2020_c = mean(OtherElection_2020 == "Y"),  
  AnyElection_2019_c = mean(AnyElection_2019 == "Y"),
  General_2018_c = mean(General_2018 == "Y"),	
  Primary_2018_c = mean(Primary_2018 == "Y"),
  OtherElection_2018_c = mean(OtherElection_2018 == "Y"),
  AnyElection_2017_c = mean(AnyElection_2017 == "Y"),
  General_2016_c = mean(General_2016 == "Y"),
  Primary_2016_c = mean(Primary_2016 == "Y"),
  OtherElection_2016_c = mean(OtherElection_2016 == "Y"),
  AnyElection_2015_c = mean(AnyElection_2015 == "Y"),
  General_2014_c = mean(General_2014 == "Y"),
  Primary_2014_c = mean(Primary_2014 == "Y"),
  OtherElection_2014_c = mean(OtherElection_2014 == "Y"),
  AnyElection_2013_c = mean(AnyElection_2013 == "Y"),
  General_2012_c = mean(General_2012 == "Y"),
  Primary_2012_c = mean(Primary_2012 == "Y"),
  OtherElection_2012_c = mean(OtherElection_2012 == "Y"),
  AnyElection_2011_c = mean(AnyElection_2011 == "Y"),
  General_2010_c = mean(General_2010 == "Y"),
  Primary_2010_c = mean(Primary_2010 == "Y"),
  OtherElection_2010_c = mean(OtherElection_2010 == "Y"),
  AnyElection_2009_c = mean(OtherElection_2010 == "Y"),
  General_2008_c = mean(General_2008 == "Y"),
  Primary_2008_c = mean(Primary_2008 == "Y"),
  OtherElection_2008_c = mean(OtherElection_2008 == "Y"),
  AnyElection_2007_c = mean(AnyElection_2007 == "Y"),
  General_2006_c = mean(General_2006 == "Y"),
  Primary_2006_c = mean(Primary_2006 == "Y"),	
  OtherElection_2006_c = mean(OtherElection_2006 == "Y"),
  AnyElection_2005_c = mean(AnyElection_2005 == "Y"),
  General_2004_c = mean(General_2004 == "Y"),
  Primary_2004_c = mean(Primary_2004 == "Y"),
  OtherElection_2004_c = mean(OtherElection_2004 == "Y"),
  AnyElection_2003_c = mean(AnyElection_2003 == "Y"),
  General_2002_c = mean(General_2002 == "Y"),
  Primary_2002_c = mean(Primary_2002 == "Y"),
  OtherElection_2002_c = mean(OtherElection_2002 == "Y"),
  AnyElection_2001_c = mean(AnyElection_2001 == "Y"),
  General_2000_c = mean(General_2000 == "Y"),
  Primary_2000_c = mean(Primary_2000 == "Y"),
  OtherElection_2000_c = mean(OtherElection_2000 == "Y")
)

# Create data frame of binary outcomes
ndup_rsch_pik_c_outcome_b <- as.data.frame(ndup_rsch_pik_c[,7:ncol(ndup_rsch_pik_c)] > 0)
names(ndup_rsch_pik_c_outcome_b) <- gsub(pattern="_c", replacement="_b",
                                        names(ndup_rsch_pik_c_outcome_b))
# Merge in with the other data
ndup_rsch_pik_c <- data.frame(ndup_rsch_pik_c, ndup_rsch_pik_c_outcome_b)

# Rbind the data frames
l2_clean <- rbind(ndup_rsch_pik_c, dup_rsch_pik_c)
save(l2_clean, file="/projects/data/analysis_data/l2_clean.RData")

#######################################################################
# Part 4: Regression analysis [All of the R code were previously submitted as supporting material for previous disclosures.] 

# Load libraries
library("haven")
library("dplyr")
library("lubridate")
library("estimatr")
library("emmeans")
library("xlsx")

# Analysis functions

# ITT

itt_function <- function(outcome, outcome_label,
                         covariates_baseline = FALSE,
                         no_site = FALSE, covariate_terms = NULL,
                         subgroup, subgroup_label, subgroup_file_label,
                         my_data = d, weights = d$f_wt_totcore98,
                         covariate_exclude = "ra_site1", covariate_type = "hud",
                         clusters, marginal_means = FALSE, include_N = TRUE,
                         sample_name, output_number,
                         explicit_sample_desc,
                         implicit_sample_desc,
                         vote_eligibility_n = NULL,
                         vote_eligibility_cluster = NULL) {
  ro_lh <- 
    if (covariates_baseline) {
      # Construct the data frame with all the necessary variables
      # Covariate center fix issue
      cov_formula <- as.formula(paste0("~", paste(covariate_terms, collapse="+"), "-1"))
      cov_mat <- model.matrix(object=cov_formula, data = my_data[subgroup,])
    } else {
      # Construct the data frame with all the necessary variables
      # Covariate center fix issue
      cov_mat <- model.matrix(object= ~ ra_site - 1, data = my_data[subgroup,])
    }
  for (col_num in 1:ncol(cov_mat)) {
    center_w <- weighted.mean(cov_mat[,col_num], w=weights[subgroup])
    cov_mat[,col_num] <- cov_mat[,col_num] - center_w
  }
  # Put together the data
  reg_data <- data.frame(outcome = my_data[subgroup,outcome],
                         model.matrix(object = ~ exp_group - 1, 
                                      data = my_data[subgroup,]),
                         cov_mat)
  # Rename the outcome 
  names(reg_data)[1] <- outcome
  # Get the variable names
  reg_dat_n <- names(reg_data)
  # Treatment terms
  treatment_terms <- paste0(reg_dat_n[3:4], collapse = " + ")
  if (no_site) {
    reg_formula <- as.formula(paste0(outcome, " ~ ", treatment_terms))
  } else {
    # Covariate terms
    covariate_names <- reg_dat_n[5:length(reg_dat_n)]
    if (!is.null(covariate_exclude)) {
      covariate_names <- covariate_names[covariate_names != covariate_exclude]
    }
    # Take out the covariates where all the values are 0
    covariates_zero <- colSums(reg_data[,covariate_names]) == 0
    covariates_terms <- paste0(covariate_names[!covariates_zero], collapse = " + ")
    # Interaction terms table
    reg_int_tab <- expand.grid(treatments = reg_dat_n[3:4], 
                               covariates = covariate_names[!covariates_zero])
    
    # Interaction terms
    int_terms <- paste0(paste0(reg_int_tab$treatments, ":", reg_int_tab$covariates), 
                        collapse = " + ")
    # Regression formula
    reg_formula <- as.formula(paste0(outcome, " ~ ", treatment_terms, " + ",
                                     covariates_terms, " + ",
                                     int_terms))
    
  }
  
  # Run the regression with the linear hypothesis
  ro_lh <- lh_robust(reg_formula, data = reg_data,
                     weights = weights[subgroup], 
                     cluster = clusters[subgroup],
                     linear_hypothesis = "exp_groupExp = exp_groupSec8")
  # Make a table of the main regression outcomes
  ro <- ro_lh$lm_robust
  
  # Make a table out of the regression results
  
  # Make the output file name
  covariate_label <- ifelse(covariates_baseline, paste0("cb_",
                                                        covariate_type), "nc")
  output_file_dir = "/projects/data/disclosure_tables/"
  
  # Output file name
  # File number
  file_number = paste0(output_number, ".", sample_name, ".ITT.", 
                       toupper(covariate_label))
  
  output_file_name = paste0("o_", file_number)
  
  # Disclosure sample file name
  disclosure_sample_file_name = paste0("ds_", file_number)
  
  
  # Put together the table
  ro_d <- data.frame(output_file_name = c(output_file_name, NA, NA), 
                     disclosure_sample_file_name = c(disclosure_sample_file_name, NA, NA),
                     sample_name = c(sample_name, NA, NA), 
                     outcome_variable = c(outcome, NA, NA),
                     outcome_label = c(outcome_label, NA, NA),
                     subgroup = c(subgroup_label, NA, NA),
                     baseline_covariates = c(covariates_baseline, NA, NA),
                     variable_name = c("Control mean", "Exp", "Sec8"),
                     coefficients = signif(ro$coefficients[1:3], 4), 
                     standard_errors = signif(ro$std.error[1:3], 4),
                     p_values = signif(ro$p.value[1:3], 4),
                     analysis_type = "ITT estimates", row.names=NULL)
  
  # If including the number of respondents
  if (include_N) {
    ro_d <- data.frame(ro_d,
                       N = c(count_round(ro$N), NA, NA),
                       N_clusters = c(count_round(ro$N_clusters), NA, NA))
  }
  
  # Linear hypothesis outputs
  lh_d <- data.frame(variable_name = row.names(ro_lh$lh),
                     coefficients = signif(ro_lh$lh$coefficients, 4),
                     standard_errors = signif(ro_lh$lh$std.error, 4),
                     p_values = signif(ro_lh$lh$p.value, 4),
                     analysis_type = "Linear hypothesis: from ITT regression")
  # Combine with the regression analysis output
  ro_d <- plyr::rbind.fill(ro_d, lh_d)
  
  # Marginal means
  if (marginal_means) {
    # Make a factor version of the dataset
    reg_data <- data.frame(reg_data,
                           exp_group_factor = my_data$exp_group[subgroup])
    ro_formula <-
      as.formula(paste0(outcome, "~exp_group_factor + ra_site2 + ra_site3 +
                        ra_site4 + ra_site5 + exp_group_factor:ra_site2 + 
                        exp_group_factor:ra_site3 + exp_group_factor:ra_site4 + 
                        exp_group_factor:ra_site5"))
    # Run the regression with the factor experimental groups
    ro_contrast <- lm_robust(ro_formula,
                             data = reg_data,
                             weights = weights[subgroup], 
                             cluster = clusters[subgroup])
    # Get the marginal means by group
    mm_results <- emmeans(ro_contrast, ~ exp_group_factor, type = "response")
    mm_results <- data.frame(outcome, outcome_label, as.data.frame(mm_results))
    # Output the results
    mm_d <- data.frame(variable_name = mm_results$exp_group_factor,
                       coefficients = signif(mm_results$emmean, 4),
                       standard_errors = signif(mm_results$SE, 4),
                       analysis_type = "Marginal means: from ITT regression")
    # Combine with the other outputs
    ro_d <- plyr::rbind.fill(ro_d, mm_d)
  }
  
  # Make the disclosure statistics
  
  # Explicit description 
  explicit_desc <- 
    if (!is.null(vote_eligibility_n)) {
      paste0("Explicit sample: ", 
             explicit_sample_desc, "; eligible to vote (18 years of age or older)")
    } else {
      paste0("Explicit sample: ", 
             explicit_sample_desc)
    }  
  
  # Overall counts
  explicit_n <- data.frame(ro_d[1, c("output_file_name", 
                                     "disclosure_sample_file_name",
                                     "outcome_variable")],
                           sample = explicit_desc,
                           participant_n = nrow(reg_data),
                           cluster_n = length(unique(clusters[subgroup])))
  
  # Implicit sample
  implicit_n <- data.frame(sample = paste0("Implicit sample: ",
                                           implicit_sample_desc),
                           participant_n = nrow(my_data[!subgroup,]),
                           cluster_n = length(unique(clusters[!subgroup])))
  
  # Additional implicit sample: eligible to vote
  if (!is.null(vote_eligibility_n)) {
    vote_eligibility_info <- data.frame(sample = paste0("Implicit sample:", explicit_sample_desc,
                                                        "; but not eligible to vote (less than 18 years of age)"),
                                        participant_n = vote_eligibility_n,
                                        cluster_n = vote_eligibility_cluster)
  }
  
  # By experimental group
  exp_group_n <- my_data[subgroup,] %>% 
    group_by(experimental_group = exp_group) %>% dplyr::summarize(
      participant_n = n(),
      cluster_n = length(unique(FAMID))
    )
  exp_group_n <- data.frame(sample = explicit_desc,
                            exp_group_n)
  
  # Combine the descriptive statistics about the sample
  desc_stat <- 
    if (!is.null(vote_eligibility_n)) {
      plyr::rbind.fill(explicit_n, vote_eligibility_info, implicit_n, exp_group_n)
    } else {
      plyr::rbind.fill(explicit_n, implicit_n, exp_group_n)
    }
  
  # By experimental group and outcome if outcome is binary
  if (length(unique(my_data[,outcome])) == 2) {
    my_data$outcome_variable <- my_data[,outcome]
    outcome_exp_group_n <- my_data[subgroup,] %>% 
      group_by(outcome_variable,                     
               experimental_group = exp_group) %>% 
      summarize(
        participant_n = n(),
        cluster_n = length(unique(FAMID))
      )
    names(outcome_exp_group_n)[names(outcome_exp_group_n) == "outcome_variable"] <-
      "outcome_variable_value"
    outcome_exp_group_n <- data.frame(sample = explicit_desc,
                                      outcome_exp_group_n)
    
    desc_stat <- plyr::rbind.fill(desc_stat, outcome_exp_group_n)
  }
  
  # Output the regression results to a csv
  
  # Output file
  write.xlsx(x = ro_d, row.names = FALSE, append = TRUE, showNA = FALSE,
             file = paste0(output_file_dir, "output_tables/output_disclosure_", today_date, ".xlsx"), 
             sheetName = output_file_name)
  
  # Disclosure sample file
  write.xlsx(x = desc_stat, row.names = FALSE, append = TRUE, showNA = FALSE,
             file = paste0(output_file_dir, "output_tables/output_disclosure_sample_", today_date, ".xlsx"), 
             sheetName = disclosure_sample_file_name)
  
  # Make file description 
  file_description <- paste0("Intention-to-treat estimates of the experimental and Section 8 vouchers on outcome; Outcome: ",
                             outcome_label, "; Age group at time of randomization: ", subgroup_label,
                             "; Regression includes pre-treatment covariates: ",
                             ifelse(covariates_baseline, "Yes", "No"),
                             "; Includes marginal means: ",
                             ifelse(marginal_means, "Yes", "No"))
  
  # Make memo file 
  memo_output <- paste0("FILE NUMBER: ", file_number, "\n",
                        "FILE NAME: output_files.ods, worksheet: ", output_file_name, "\n",
                        "FILE DESCRIPTION: ", file_description , "\n",
                        "RESEARCH OUTPUT PROGRAM: ", "data_analysis_disclosure.R", "\n",
                        "RESEARCH SAMPLE NUMBER: ", sample_name, "\n",
                        "DISCLOSURE STATISTICS FILE NAME: disclosure_sample_file.ods, worksheet: ", disclosure_sample_file_name, "\n",
                        "DISCLOSURE STATISTICS PROGRAM: ", "data_analysis_disclosure.R", "\n",
                        "COMMENT: ", "NA", "\n"
  )
  # Write memo to output directory
  write(memo_output, append = TRUE,
        file = paste0(output_file_dir, "memo/memo_information_", today_date, ".txt"))
  
}


# Function to estimate the treatment on the treated
iv_function <- function(outcome, outcome_label,
                        covariates_baseline = FALSE,
                        no_site = FALSE, covariate_terms = NULL,
                        subgroup, subgroup_label, subgroup_file_label,
                        my_data = d, weights = d$f_wt_totcore98,
                        covariate_exclude = "ra_site1", covariate_type = "hud",
                        clusters, include_N = TRUE,
                        sample_name, output_number, marginal_means = FALSE,
                        interaction_cov = TRUE,
                        explicit_sample_desc,
                        implicit_sample_desc,
                        vote_eligibility_n = NULL,
                        vote_eligibility_cluster = NULL) {
  
  ro_lh <- 
    if (covariates_baseline) {
      # Construct the data frame with all the necessary variables
      # Covariate center fix issue
      cov_formula <- as.formula(paste0("~", paste(covariate_terms, collapse="+"), "-1"))
      cov_mat <- model.matrix(object=cov_formula, data = my_data[subgroup,])
    } else {
      # Construct the data frame with all the necessary variables
      # Covariate center fix issue
      cov_mat <- model.matrix(object=~ra_site - 1, data = my_data[subgroup,])
    }
  for (col_num in 1:ncol(cov_mat)) {
    center_w <- weighted.mean(cov_mat[,col_num], w=weights[subgroup])
    cov_mat[,col_num] <- cov_mat[,col_num] - center_w
  }
  # Remove covariate columns where the data are all 0
  cov_mat <- cov_mat[,colSums(cov_mat) != 0]
  # Put together the data
  treatment_vars <- c("cmove_exp", "cmove_sec8",
                      "exp_group_Exp", "exp_group_Sec8")
  reg_data <- data.frame(outcome = my_data[subgroup, outcome],
                         my_data[subgroup, treatment_vars],
                         cov_mat)
  # Rename the outcome 
  names(reg_data)[1] <- outcome
  # Get the variable names
  reg_dat_n <- names(reg_data)
  # Make the regression formula
  reg_formula <- 
    if (no_site) {
      paste0(outcome, " ~ cmove_exp + cmove_sec8 | exp_group_Exp + exp_group_Sec8")
    } else {
      # Covariate terms
      covariate_names <- reg_dat_n[!reg_dat_n %in% c(treatment_vars, outcome)]
      if (!is.null(covariate_exclude)) {
        covariate_names <- covariate_names[covariate_names != covariate_exclude]
      }
      covariates_terms <- paste0(covariate_names, collapse = " + ")
      # Make the regression formula
      if (interaction_cov) {
        # Interaction terms table
        reg_int_tab_left <- expand.grid(treatments = c("cmove_exp", "cmove_sec8"), 
                                        covariates = covariate_names)
        reg_int_tab_right <- expand.grid(treatments = c("exp_group_Exp", "exp_group_Sec8"), 
                                         covariates = covariate_names)
        
        # Interaction terms
        int_terms_left <- paste0(paste0(reg_int_tab_left$treatments, ":", 
                                        reg_int_tab_left$covariates), 
                                 collapse = " + ")
        int_terms_right <- paste0(paste0(reg_int_tab_right$treatments, ":", 
                                         reg_int_tab_right$covariates), 
                                  collapse = " + ")
        paste0(outcome, " ~ cmove_exp + cmove_sec8 + ",
               covariates_terms, " + ",
               int_terms_left, " | ",
               "exp_group_Exp + exp_group_Sec8 +",
               covariates_terms, " + ",
               int_terms_right)
      } else {
        paste0(outcome, " ~ cmove_exp + cmove_sec8 + ",
               covariates_terms, " | ",
               "exp_group_Exp + exp_group_Sec8 +",
               covariates_terms)
      }
      
    }
  
  # Run the regression
  ro <- iv_robust(as.formula(reg_formula), data = reg_data,
                  weights = weights[subgroup], cluster = clusters[subgroup], 
                  se_type = "stata")
  
  # Make a table out of the regression results
  
  # Make the output file name
  covariate_label <- ifelse(covariates_baseline, paste0("cb_",
                                                        covariate_type), "nc")
  output_file_dir = "/projects/data/disclosure_tables/"
  
  # Output file name
  # File number
  file_number = paste0(output_number, ".", sample_name, ".TOT.", 
                       toupper(covariate_label))
  
  output_file_name = paste0("o_", file_number)
  
  # Disclosure sample file name
  disclosure_sample_file_name = paste0("ds_", file_number)
  
  
  # Put together the table
  ro_d <- data.frame(output_file_name = c(output_file_name, NA, NA), 
                     disclosure_sample_file_name = c(disclosure_sample_file_name, NA, NA),
                     sample_name = c(sample_name, NA, NA), 
                     outcome_variable = c(outcome, NA, NA),
                     outcome_label = c(outcome_label, NA, NA),
                     subgroup = c(subgroup_label, NA, NA),
                     baseline_covariates = c(covariates_baseline, NA, NA),
                     variable_name = c("Control mean", "Exp", "Sec8"),
                     coefficients = signif(ro$coefficients[1:3], 4), 
                     standard_errors = signif(ro$std.error[1:3], 4), 
                     p_values = signif(ro$p.value[1:3], 4), 
                     analysis_type = "TOT estimates", row.names=NULL)
  
  # If including the number of respondents
  if (include_N) {
    ro_d <- data.frame(ro_d,
                       N = c(count_round(ro$N), NA, NA),
                       N_clusters = c(count_round(ro$N_clusters), NA, NA))
  }
  
  
  # Make the disclosure statistics
  
  # Explicit description 
  explicit_desc <- 
    if (!is.null(vote_eligibility_n)) {
      paste0("Explicit sample: ", 
             explicit_sample_desc, "; eligible to vote (18 years of age or older)")
    } else {
      paste0("Explicit sample: ", 
             explicit_sample_desc)
    }  
  
  # Overall counts
  explicit_n <- data.frame(ro_d[1, c("output_file_name", 
                                     "disclosure_sample_file_name",
                                     "outcome_variable")],
                           sample = explicit_desc,
                           participant_n = nrow(reg_data),
                           cluster_n = length(unique(clusters[subgroup])))
  
  # Implicit sample
  implicit_n <- data.frame(sample = paste0("Implicit sample: ",
                                           implicit_sample_desc),
                           participant_n = nrow(my_data[!subgroup,]),
                           cluster_n = length(unique(clusters[!subgroup])))
  
  # Additional implicit sample: eligible to vote
  if (!is.null(vote_eligibility_n)) {
    vote_eligibility_info <- data.frame(sample = paste0("Implicit sample:", explicit_sample_desc,
                                                        "; but not eligible to vote (less than 18 years of age)"),
                                        participant_n = vote_eligibility_n,
                                        cluster_n = vote_eligibility_cluster)
  }
  
  # By experimental group
  exp_group_n <- my_data[subgroup,] %>% 
    group_by(experimental_group_move = exp_group_move) %>% dplyr::summarize(
      participant_n = n(),
      cluster_n = length(unique(FAMID))
    )
  exp_group_n <- data.frame(sample = explicit_desc,
                            exp_group_n)
  
  # Combine the descriptive statistics about the sample
  desc_stat <- 
    if (!is.null(vote_eligibility_n)) {
      plyr::rbind.fill(explicit_n, vote_eligibility_info, implicit_n, exp_group_n)
    } else {
      plyr::rbind.fill(explicit_n, implicit_n, exp_group_n)
    }
  
  # By experimental group and outcome if outcome is binary
  if (length(unique(my_data[,outcome])) == 2) {
    my_data$outcome_variable <- my_data[,outcome]
    outcome_exp_group_n <- my_data[subgroup,] %>% 
      group_by(outcome_variable,                     
               experimental_group_move = exp_group_move) %>% 
      dplyr::summarize(
        participant_n = n(),
        cluster_n = length(unique(FAMID))
      )
    names(outcome_exp_group_n)[names(outcome_exp_group_n) == "outcome_variable"] <-
      "outcome_variable_value"
    outcome_exp_group_n <- data.frame(sample = explicit_desc,
                                      outcome_exp_group_n)
    
    desc_stat <- plyr::rbind.fill(desc_stat, outcome_exp_group_n)
  }
  
  # Output the regression results to a csv
  
  # Output file
  write.xlsx(x = ro_d, row.names = FALSE, append = TRUE, showNA = FALSE, 
             file = paste0(output_file_dir, "output_tables/output_disclosure_", today_date, ".xlsx"), 
             sheetName = output_file_name)
  
  # Disclosure sample file
  write.xlsx(x = desc_stat, row.names = FALSE, append = TRUE, showNA = FALSE,
             file = paste0(output_file_dir, "output_tables/output_disclosure_sample_", today_date, ".xlsx"), 
             sheetName = disclosure_sample_file_name)
  
  # Make file description 
  file_description <- paste0("Treatment-on-the-treated estimates of the experimental and Section 8 vouchers on outcome; Outcome: ",
                             outcome_label, "; Age group at time of randomization: ", subgroup_label,
                             "; Regression includes pre-treatment covariates: ",
                             ifelse(covariates_baseline, "Yes", "No"),
                             "; Includes marginal means: ",
                             ifelse(marginal_means, "Yes", "No"))
  
  # Make memo file 
  memo_output <- paste0("FILE NUMBER: ", file_number, "\n",
                        "FILE NAME: output_files.ods, worksheet: ", output_file_name, "\n",
                        "FILE DESCRIPTION: ", file_description , "\n",
                        "RESEARCH OUTPUT PROGRAM: ", "data_analysis_disclosure_1.R", "\n",
                        "RESEARCH SAMPLE NUMBER: ", sample_name, "\n",
                        "DISCLOSURE STATISTICS FILE NAME: disclosure_sample_file.ods, worksheet:", disclosure_sample_file_name, "\n",
                        "DISCLOSURE STATISTICS PROGRAM: ", "data_analysis_disclosure_1.R", "\n",
                        "COMMENT: ", "NA", "\n"
  )
  
  # Write memo to output directory
  write(memo_output, append = TRUE,
        file = paste0(output_file_dir, "memo/memo_information_", today_date, ".txt"))
  
}

# Load in datasets
# Load MTO datasets
# ID dataset
mto_id <- read_sas("[redacted]/mto1994_2010.sas7bdat")
# Make a variable for duplicated or missing PIK
mto_id$pik_missing_dup <- mto_id$pik == "" | duplicated(mto_id$pik)
# Remove the duplicated or missing PIK
mto_id <- mto_id[!mto_id$pik_missing_dup,]

# Load final roster dataset
mto_r <- read_sas("[redacted]/mto_fin_roster.sas7bdat")
# Remove the ones that have weight of 0 because they are not part of the final roster
mto_r <- mto_r[mto_r$f_wt_totcore98 != 0,]
names(mto_id)[names(mto_id) == "ppid"] <- "PPID"
# Merge the id into the roster
mto <- merge(x = mto_r, y = mto_id, all.x = TRUE, id = "PPID")
# Make a variable for duplicated or missing PIK
mto$pik_missing_dup <- is.na(mto$pik)
mto$pik_missing_dup[duplicated(mto$pik) & !is.na(mto$pik)] <- TRUE

# Make the treatment assignments and site into factors
mto$exp_group <- NA
mto$exp_group[mto$ra_group == 1] <- "Exp"
mto$exp_group[mto$ra_group == 2] <- "Sec8"
mto$exp_group[mto$ra_group == 3] <- "Control"
mto$exp_group <- factor(mto$exp_group, levels=c("Control", "Exp", "Sec8"))
mto$ra_site <- factor(mto$ra_site)

# Create subgroups
# Age at time of randomization
# Load in clean data

#########
# Created by this Stata do file
# * Get the year of random assignment out

# clear

# * Load the data
# import sas PPID ra_date using "[redacted]/mto_fin_roster.sas7bdat"

# * Get the year variable
# gen year_at_ra = year(ra_date)

# * Drop the original ra_date variable
drop ra_date 

# * Output
# export delimited using "/projects/data/analysis_data/ra_year_variable.csv", replace

ra_year <- read.csv("/projects/data/analysis_data/ra_year_variable.csv")


# Merge into the mto dataset
mto <- merge(x = mto, y = ra_year, all.x = TRUE, by = "PPID")

# Make a variable for year of birthday
mto$birth_year <- ifelse(is.na(mto$dob_year), mto$f_svy_yob_imp,
                         as.numeric(mto$dob_year))
mto$age_at_ra <- mto$year_at_ra - mto$birth_year
summary(mto$age_at_ra)


# Create the age groups
# Look at the imputed age at baseline variable
mto$age_groups_ac <- NA
mto$age_groups_ac[mto$age_at_ra > 18 | mto$f_svy_sample2007 == "AD" |
                    mto$f_svy_sample2007 == "ES"] <- "Adult"
mto$age_groups_ac[mto$age_at_ra <= 18 & mto$age_at_ra >= 13] <- "Age 13-18"
mto$age_groups_ac[mto$age_at_ra < 13] <- "Age < 13"
# Look at the groups
table(mto$age_groups_ac, useNA = "always")

# Make a table for checking out the duplicates and missings later
mto_check <- mto[,c("pik_missing_dup", "f_wt_totcore98", "FAMID", "age_groups_ac",
                    "PPID", "ra_site", "ra_group", "exp_group")]

# Make an "all" respondent variable
mto$all <- TRUE

# Make a function for rounding the N

count_round <- function(x) {
  if (x < 15) {
    "N < 15"
  } else if (x >= 15 & x < 100) {
    plyr::round_any(x, accuracy=10)
  } else if (x >= 100 & x < 1000) {
    plyr::round_any(x, accuracy = 50)
  } else if (x >= 1000 & x < 10000) {
    plyr::round_any(x, accuracy = 100)
  } else if (x >= 10000 & x < 100000) {
    plyr::round_any(x, accuracy = 500)
  } else if (x >= 100000 & x < 1000000) {
    plyr::round_any(x, accuracy = 1000)
  } else {
    signif(x, digits = 4)
  }
}

# Remove the missing or duplicated
mto <- mto[!mto$pik_missing_dup,]
save(mto, file="/projects/data/analysis_data/mto_clean.RData")

# Remove datasets we don't need anymore
rm("mto_id")
rm("mto_r")
rm("ra_year")

# Load in the clean L2 data
load("/projects/data/analysis_data/l2_clean.RData")

# Merge with the L2 data
d <- merge(x = mto, y = l2_clean, all.x = TRUE, by="pik")
save(d, file="/projects/data/analysis_data/merged_mto_l2_clean.RData")
load("/projects/data/analysis_data/merged_mto_l2_clean.RData")

# Remove the datasets we don't need
rm("mto")
rm("l2_clean")

# Merge in the date-of-birth data
# Load the DOB data
dob <- read_sas("[redacted]/mto1994_2010_fulldob.sas7bdat")
names(dob)[names(dob) == "ppid"] <- "PPID"
# Merge with the dataset "d"
d <- merge(x = d, y = dob, by = "PPID", all.x = TRUE)
d$dob_f <- as.Date(d$dob, tryFormats = "%m/%d/%Y")
# Remove the dob data.frame
rm("dob")

# Load the voting eligibility dataset
# Dataset contains the election days in November for each year from 2000 to 2019
election_days <- read.csv("/projects/data/analysis_data/election_days.csv", 
                          stringsAsFactors = FALSE)
election_days$CanVoteDOB <- as.Date(election_days$CanVoteDOB, tryFormats = "%m/%d/%Y")

# Create the variables for whether someone is eligible to vote or not
for (year in election_days$Year) {
  d[,paste0("canvote_", year)] <- d$dob_f <= election_days$CanVoteDOB[election_days$Year == year]
}


# Make the summary statistics and balance tests 
# Load in the MTO final analysis file
# This contains the baseline covariates that we will need to use
fa_d <- read_sas("[redacted]/mto_fin_analysis.sas7bdat")
# names(fa_d)[grepl(pattern="X_F_HH", x=names(fa_d))]
fa_d$in_d <- fa_d$PPID %in% d$PPID
summary(fa_d$in_d)

# Create a varaible for whether the subjects in d are in the final analysis file
d$in_fa_d <- d$PPID %in% fa_d$PPID

# Create the variables for the IV analysis
d$exp_group_Exp <- d$exp_group == "Exp"
d$exp_group_Sec8 <- d$exp_group == "Sec8"
d$cmove_exp <- d$f_svy_cmove == 1 & d$exp_group == "Exp"
d$cmove_sec8 <- d$f_svy_cmove == 1 & d$exp_group == "Sec8"

# Clean up the voting outcome data

# Outcome: in the voter file
d$in_voter_file <- !is.na(d$l2_num_records)

# General election 
# Set what's missing to FALSE
d$General_2000_b[is.na(d$General_2000_b)] <- FALSE
d$General_2002_b[is.na(d$General_2002_b)] <- FALSE
d$General_2004_b[is.na(d$General_2004_b)] <- FALSE
d$General_2006_b[is.na(d$General_2006_b)] <- FALSE
d$General_2008_b[is.na(d$General_2008_b)] <- FALSE
d$General_2010_b[is.na(d$General_2010_b)] <- FALSE
d$General_2012_b[is.na(d$General_2012_b)] <- FALSE
d$General_2014_b[is.na(d$General_2014_b)] <- FALSE
d$General_2016_b[is.na(d$General_2016_b)] <- FALSE
d$General_2018_b[is.na(d$General_2018_b)] <- FALSE

# Number of elections one is eligible to vote in
d$eligible_all_elections <- rowSums(d[,paste0("canvote_", 2000:2019)])
# Create the odd/even years variable
president_years <- seq(from=2000, to = 2018, by=4)
midterm_years <- seq(from=2002, to = 2020, by=4)
even_years <- seq(from=2000, to = 2018, by=2)
odd_years <- seq(from=2001, to = 2019, by=2)
# Number of even-year elections one is eligible to vote in
d$eligible_even_elections <- rowSums(d[,paste0("canvote_", even_years)])
# Number of odd-year elections one is eligible to vote in
d$eligible_odd_elections <- rowSums(d[,paste0("canvote_", odd_years)])
# Number of presidential elections one is eligible to vote in
d$eligible_presidential_elections <- rowSums(d[,paste0("canvote_", president_years)])
# Number of midterm elections one is eligible to vote in
d$eligible_midterm_elections <- rowSums(d[,paste0("canvote_", midterm_years)])


# Generate the proportion of national-level general elections voted in for which one is eligible to vote in
foo <- rowSums(d[,paste0("General_", even_years, "_b")])
d$General_total <- rowSums(d[,paste0("General_", even_years, "_b")])
d$General_prop <- d$General_total/d$eligible_even_elections
# Generate the proportion of presidential elections voted in
d$President_total <-  rowSums(d[,paste0("General_", president_years, "_b")])
d$President_prop <- d$President_total/d$eligible_presidential_elections
# Generate the proportion of midterm elections voted in
d$Midterm_total <-  rowSums(d[,paste0("General_", midterm_years, "_b")])
d$Midterm_prop <- d$Midterm_total/d$eligible_midterm_elections

# [redacted] people have proportions greater than 1 -- set these people to 1
# General federal elections
d$General_prop_err <- d$General_prop > 1
d$General_prop[d$General_prop_err] <- 1
# Presidential elections
d$President_prop_err <- d$President_prop > 1
d$President_prop[d$President_prop_err] <- 1
# Midterm elections
d$Midterm_prop_err <- d$Midterm_prop > 1
d$Midterm_prop[d$Midterm_prop_err] <- 1

# Make the covariates for the covariates analysis

# Create the final analysis merged dataset
df <- merge(x = d, y = fa_d, by="PPID")

# Add age to df
df$age <- as.numeric((as.Date("07/01/2021", tryFormats = "%m/%d/%Y") - df$dob_f)/365)
df$age <- floor(df$age)

# The baseline covariates according to HUD report
# Adult
hud_adult_cov <- toupper(c("x_f_ad_36_40", "x_f_ad_41_45", "x_f_ad_46_50", 
                           "x_f_ad_edged", "x_f_ad_edgradhs", "x_f_ad_edgradhs_miss",
                           "x_f_ad_edinsch", "x_f_ad_ethn_hisp", "x_f_ad_le_35",
                           "x_f_ad_male", "x_f_ad_nevmarr", "x_f_ad_parentu18",
                           "x_f_ad_race_black", "x_f_ad_race_other", "x_f_ad_working",
                           "x_f_hh_afdc", "x_f_hh_car", "x_f_hh_disabl", "x_f_hh_noteens",
                           "x_f_hh_size2", "x_f_hh_size3", "x_f_hh_size4", "x_f_hh_victim",
                           "x_f_hood_5y", "x_f_hood_chat", "x_f_hood_nbrkid",
                           "x_f_hood_nofamily", "x_f_hood_nofriend", "x_f_hood_unsafenit",
                           "x_f_hood_verydissat", "x_f_hous_fndapt", "x_f_hous_mov3tm",
                           "x_f_hous_movdrgs", "x_f_hous_movschl", "x_f_hous_sec8bef",
                           "x_f_site_balt", "x_f_site_bos", "x_f_site_chi", "x_f_site_la"))
# Child
hud_child_cov <- paste0("X_F_", c("CH_MALE", "CH_SCHPLAY", "CH_SPECMED",
                                  "CH_SCHPLAY_MISS", "C1_EXPEL", "C1_BEHPRB",
                                  "C1_GIFTED", "C1_LRNPRB", "C1_SCHCLL", "C2_HOSP",
                                  "C2_LOWBW", "C2_READ", "C1_BEHPRB_MISS",
                                  "C1_EXPEL_MISS", "C1_GIFTED_MISS", "C1_LRNPRB_MISS",
                                  "C2_HOSP_MISS", "C2_LOWBW_MISS", "C2_READ_MISS",
                                  "HH_AFDC", "HH_CAR", "HH_DISABL", "HH_NOTEENS",
                                  "HH_SIZE2", "HH_SIZE3", "HH_SIZE4", "HH_VICTIM",
                                  "HOOD_5Y", "HOOD_CHAT", "HOOD_NBRKID", "HOOD_NOFAMILY",
                                  "HOOD_NOFRIEND", "HOOD_UNSAFENIT", "HOOD_VERYDISSAT",
                                  "HOUS_FNDAPT", "HOUS_MOV3TM", "HOOD_VERYDISSAT",
                                  "HOUS_MOVSCHL", "HOUS_SEC8BEF",
                                  "AD_EDGED", "AD_EDGRADHS", "AD_EDINSCH",
                                  "AD_ETHN_HISP", "AD_MALE", "AD_NEVMARR",
                                  "AD_PARENTU18", "AD_RACE_BLACK", "AD_RACE_OTHER",
                                  "AD_WORKING", "AD_EDGRADHS_MISS"))
hud_child_cov <- c(hud_child_cov, "age_at_ra", "age",
                   toupper(c("x_f_site_balt", "x_f_site_bos", 
                             "x_f_site_chi", "x_f_site_la")))

table(df$X_F_C1_GIFTED[df$age_groups_ac != "Adult"], useNA="always")
table(df$X_F_C1_GIFTED_MISS[df$age_groups_ac != "Adult"], useNA="always")

cov_combined <- c(hud_adult_cov, hud_child_cov)
cov_combined <- unique(cov_combined)
write.csv(cov_combined, "/projects/data/analysis_data/baseline_covariates.csv")

# Clean up the FAMID in df
table(df$FAMID.x == df$FAMID.y, useNA="always")
df$FAMID.y <- NULL
names(df)[names(df) == "FAMID.x"] <- "FAMID"

# Check all the child covariates are in the df
hud_child_cov_c <- data.frame(hud_child_cov, check = hud_child_cov %in% names(df))
hud_child_cov_c[!hud_child_cov_c$check,]

# Create a variable that combines the experimental group and whether the subject moved
d$exp_group_move <- paste0(d$exp_group, "-", d$f_svy_cmove)
df$exp_group_move <- paste0(df$exp_group, "-", df$f_svy_cmove)

# Load the analysis file 
primary_analysis <- read.csv("/projects/data/analysis_data/disclosure_primary_outcomes.csv",
                             stringsAsFactors = FALSE)

#############################################
# Main regression analysis

# ITT analysis function to run the analysis over all the variables (adults)

itt_analysis_function <- function(variable_number,
                                  covariate_terms = NULL,
                                  covariates_baseline = FALSE,
                                  analysis_subgroup = d$age_groups_ac == "Adult",
                                  analysis_subgroup_label = "Adult",
                                  my_data = d,
                                  weights = d$f_wt_totcore98,
                                  clusters = d$FAMID,
                                  include_N = TRUE,
                                  subgroup_file_label = "adult",
                                  marginal_means = TRUE,
                                  sample_name = "1.A",
                                  explicit_sample_desc = "adult",
                                  implicit_sample_desc = "non-adult",
                                  vote_eligibility_n = NULL,
                                  vote_eligibility_cluster = NULL
) {
  itt_function(outcome= primary_analysis$outcome[variable_number], 
               outcome_label = primary_analysis$outcome_label[variable_number], 
               covariates_baseline = covariates_baseline, 
               covariate_terms = covariate_terms,
               subgroup= analysis_subgroup,
               subgroup_label = analysis_subgroup_label,
               my_data = my_data, 
               weights = weights, 
               clusters = clusters, 
               include_N = include_N,
               subgroup_file_label = subgroup_file_label,
               marginal_means = marginal_means,
               sample_name = sample_name, 
               output_number = variable_number,
               explicit_sample_desc = explicit_sample_desc,
               implicit_sample_desc = implicit_sample_desc,
               vote_eligibility_n = vote_eligibility_n,
               vote_eligibility_cluster = vote_eligibility_cluster)
}

# IV analysis function to run the analysis over all the variables (adults)
iv_analysis_function <- function(variable_number,
                                 covariate_terms = NULL,
                                 covariates_baseline = FALSE,
                                 analysis_subgroup = d$age_groups_ac == "Adult",
                                 analysis_subgroup_label = "Adult",
                                 my_data = d,
                                 weights = d$f_wt_totcore98,
                                 clusters = d$FAMID,
                                 include_N = TRUE,
                                 subgroup_file_label = "adult",
                                 marginal_means = FALSE,
                                 sample_name = "1.A",
                                 explicit_sample_desc = "adult",
                                 implicit_sample_desc = "non-adult",
                                 vote_eligibility_n = NULL,
                                 vote_eligibility_cluster = NULL) {
  iv_function(outcome= primary_analysis$outcome[variable_number], 
              outcome_label = primary_analysis$outcome_label[variable_number], 
              covariates_baseline = covariates_baseline, 
              covariate_terms = covariate_terms,
              subgroup= analysis_subgroup,
              subgroup_label = analysis_subgroup_label,
              my_data = my_data, 
              weights = weights, 
              clusters = clusters, 
              include_N = include_N,
              subgroup_file_label = subgroup_file_label,
              marginal_means = marginal_means,
              sample_name = sample_name, 
              output_number = variable_number,
              explicit_sample_desc = explicit_sample_desc,
              implicit_sample_desc = implicit_sample_desc,
              vote_eligibility_n = vote_eligibility_n,
              vote_eligibility_cluster = vote_eligibility_cluster)
}

primary_analysis <- primary_analysis[primary_analysis$outcome != "pik_missing_dup",]

# Adults: no covariate baseline
# ITT
lapply(1:nrow(primary_analysis), itt_analysis_function)
# TOT
lapply(1:nrow(primary_analysis), iv_analysis_function)

# Adult: with baseline covariates
# ITT
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = TRUE,
       covariate_terms = hud_adult_cov,
       sample_name = "2.A",
       analysis_subgroup = df$age_groups_ac == "Adult",
       analysis_subgroup_label = "Adult",
       my_data = df, 
       weights = df$f_wt_totcore98, 
       marginal_means = FALSE,
       clusters = df$FAMID, 
)
# TOT
lapply(1:nrow(primary_analysis), iv_analysis_function,
       covariates_baseline = TRUE,
       covariate_terms = hud_adult_cov,
       sample_name = "2.A",
       analysis_subgroup = df$age_groups_ac == "Adult",
       analysis_subgroup_label = "Adult",
       my_data = df, 
       weights = df$f_wt_totcore98, 
       marginal_means = FALSE,
       clusters = df$FAMID, 
)

# Run the regressions: Age < 13
# ITT
for (i in 1:nrow(primary_analysis)) {
  print(primary_analysis$outcome_label[i])
  # Subgroup analysis: need to figure out who is eligible to vote in what year 
  # d  
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_d <- d$age_groups_ac == "Age < 13" & d[,paste0("canvote_", election_year)]
    age_groups_not_eligible_d <- d$age_groups_ac == "Age < 13" & d[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_d_n <- sum(age_groups_not_eligible_d)
    my_vote_eligibility_d_cluster <- length(unique(d$FAMID[age_groups_not_eligible_d]))
  } else {
    subgroup_analysis_d <- d$age_groups_ac == "Age < 13"
    my_vote_eligibility_d_n <- NULL
    my_vote_eligibility_d_cluster <- NULL
  }
  # df
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_df <- df$age_groups_ac == "Age < 13" & df[,paste0("canvote_", election_year)]
    age_groups_not_eligible_df <- df$age_groups_ac == "Age < 13" & df[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_df_n <- sum(age_groups_not_eligible_df)
    my_vote_eligibility_df_cluster <- length(unique(d$FAMID[age_groups_not_eligible_df]))
  } else {
    subgroup_analysis_df <- df$age_groups_ac == "Age < 13"
    my_vote_eligibility_df_n <- NULL
    my_vote_eligibility_df_cluster <- NULL
  }
  outcome_variable <- primary_analysis$outcome[i]
  outcome_y <- d[,outcome_variable]
 
# Without baseline covariates 
  if (sum(subgroup_analysis_d) > [redacted] & sum(outcome_y[subgroup_analysis_d]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = primary_analysis$child_less_13_fr[i],
                          covariates_baseline = FALSE,
                          analysis_subgroup = subgroup_analysis_d,
                          analysis_subgroup_label = "Age < 13",
                          my_data = d, 
                          weights = d$f_wt_totcore98, 
                          marginal_means = TRUE,
                          clusters = d$FAMID,
                          explicit_sample_desc = "children less than 13 years of age",
                          implicit_sample_desc = "adults; children 13 to 18",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster)
    # IV
    iv_analysis_function(variable_number = i, 
                         include_N = TRUE, 
                         sample_name = primary_analysis$child_less_13_fr[i],
                         covariates_baseline = FALSE,
                         analysis_subgroup = subgroup_analysis_d,
                         analysis_subgroup_label = "Age < 13",
                         my_data = d, 
                         weights = d$f_wt_totcore98, 
                         marginal_means = FALSE,
                         clusters = d$FAMID,
                         explicit_sample_desc = "children less than 13 years of age",
                         implicit_sample_desc = "adults; children 13 to 18",
                         vote_eligibility_n = my_vote_eligibility_d_n,
                         vote_eligibility_cluster = my_vote_eligibility_d_cluster)
  }
  # With baseline covariates 
  if (sum(subgroup_analysis_df) > [redacted] & sum(outcome_y[subgroup_analysis_df]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = primary_analysis$child_less_13_fa[i],
                          covariates_baseline = TRUE,
                          covariate_terms = hud_child_cov, 
                          analysis_subgroup = subgroup_analysis_df,
                          analysis_subgroup_label = "Age < 13",
                          my_data = df, 
                          weights = df$f_wt_totcore98, 
                          marginal_means = FALSE,
                          clusters = df$FAMID,
                          explicit_sample_desc = "children less than 13 years of age",
                          implicit_sample_desc = "adults; children 13 to 18",
                          vote_eligibility_n = my_vote_eligibility_df_n,
                          vote_eligibility_cluster = my_vote_eligibility_df_cluster
    )
    # IV
    iv_analysis_function(variable_number = i, 
                         include_N = TRUE, 
                         sample_name = primary_analysis$child_less_13_fa[i],
                         covariates_baseline = TRUE,
                         covariate_terms = hud_child_cov, 
                         analysis_subgroup = subgroup_analysis_df,
                         analysis_subgroup_label = "Age < 13",
                         my_data = df, 
                         weights = df$f_wt_totcore98, 
                         marginal_means = FALSE,
                         clusters = df$FAMID,
                         explicit_sample_desc = "children less than 13 years of age",
                         implicit_sample_desc = "adults; children 13 to 18",
                         vote_eligibility_n = my_vote_eligibility_df_n,
                         vote_eligibility_cluster = my_vote_eligibility_df_cluster
    )
  }                   
  
}

# Run the regressions: Age 13 to 18
for (i in 1:nrow(primary_analysis)) {
  print(primary_analysis$outcome_label[i])
  # Subgroup analysis: need to figure out who is eligible to vote in what year
  # d  
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_d <- d$age_groups_ac == "Age 13-18" & d[,paste0("canvote_", election_year)]
    age_groups_not_eligible_d <- d$age_groups_ac == "Age 13-18" & d[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_d_n <- sum(age_groups_not_eligible_d)
    my_vote_eligibility_d_cluster <- length(unique(d$FAMID[age_groups_not_eligible_d]))
  } else {
    subgroup_analysis_d <- d$age_groups_ac == "Age 13-18"
    my_vote_eligibility_d_n <- NULL
    my_vote_eligibility_d_cluster <- NULL
  }
  # df
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_df <- df$age_groups_ac == "Age 13-18" & df[,paste0("canvote_", election_year)]
    age_groups_not_eligible_df <- df$age_groups_ac == "Age 13-18" & df[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_df_n <- sum(age_groups_not_eligible_df)
    my_vote_eligibility_df_cluster <- length(unique(d$FAMID[age_groups_not_eligible_df]))
  } else {
    subgroup_analysis_df <- df$age_groups_ac == "Age 13-18"
    my_vote_eligibility_df_n <- NULL
    my_vote_eligibility_df_cluster <- NULL
  }
  
  outcome_variable <- primary_analysis$outcome[i]
  outcome_y <- d[,outcome_variable]
  # Without baseline covariates 
  if (sum(subgroup_analysis_d) > [redacted] & sum(outcome_y[subgroup_analysis_d]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = primary_analysis$child_13_18_fr[i],
                          covariates_baseline = FALSE,
                          analysis_subgroup = subgroup_analysis_d,
                          analysis_subgroup_label = "Age 13-18",
                          my_data = d, 
                          weights = d$f_wt_totcore98, 
                          marginal_means = TRUE,
                          clusters = d$FAMID,
                          explicit_sample_desc = "children 13 to 18",
                          implicit_sample_desc = "adults; children less than 13 years of age",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster)
    # IV
    iv_analysis_function(variable_number = i, 
                         include_N = TRUE, 
                         sample_name = primary_analysis$child_13_18_fr[i],
                         covariates_baseline = FALSE,
                         analysis_subgroup = subgroup_analysis_d,
                         analysis_subgroup_label = "Age 13-18",
                         my_data = d, 
                         weights = d$f_wt_totcore98, 
                         marginal_means = FALSE,
                         clusters = d$FAMID,
                         explicit_sample_desc = "children 13 to 18",
                         implicit_sample_desc = "adults; children less than 13 years of age",
                         vote_eligibility_n = my_vote_eligibility_d_n,
                         vote_eligibility_cluster = my_vote_eligibility_d_cluster)
  }
  # With baseline covariates 
  if (sum(subgroup_analysis_df) > [redacted] & sum(outcome_y[subgroup_analysis_df]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = primary_analysis$child_13_18_fa[i],
                          covariates_baseline = TRUE,
                          covariate_terms = hud_child_cov, 
                          analysis_subgroup = subgroup_analysis_df,
                          analysis_subgroup_label = "Age 13-18",
                          my_data = df, 
                          weights = df$f_wt_totcore98, 
                          marginal_means = FALSE,
                          clusters = df$FAMID,
                          explicit_sample_desc = "children 13 to 18",
                          implicit_sample_desc = "adults; children less than 13 years of age",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster
    )
    # IV
    iv_analysis_function(variable_number = i, 
                         include_N = TRUE, 
                         sample_name = primary_analysis$child_13_18_fa[i],
                         covariates_baseline = TRUE,
                         covariate_terms = hud_child_cov, 
                         analysis_subgroup = subgroup_analysis_df,
                         analysis_subgroup_label = "Age 13-18",
                         my_data = df, 
                         weights = df$f_wt_totcore98, 
                         marginal_means = FALSE,
                         clusters = df$FAMID,
                         explicit_sample_desc = "children 13 to 18",
                         implicit_sample_desc = "adults; children less than 13 years of age",
                         vote_eligibility_n = my_vote_eligibility_d_n,
                         vote_eligibility_cluster = my_vote_eligibility_d_cluster
    )
  }                   
  
}

# Checking for the percentage of subjects with missing PIKs

primary_analysis_c <- read.csv("/projects/data/analysis_data/disclosure_primary_outcomes.csv",
                               stringsAsFactors = FALSE)

# Function 
check_pik <- function(subgroup, subgroup_label, subgroup_file_label,
                      sample_name, explicit_sample_desc, implicit_sample_desc) {
  itt_function(outcome = "pik_missing_dup", no_site=FALSE, 
               explicit_sample_desc = explicit_sample_desc, 
               implicit_sample_desc = implicit_sample_desc, 
               output_number = 11, 
               sample_name = sample_name, 
               include_N = TRUE, 
               marginal_means = TRUE, 
               clusters = mto_check$FAMID,
               weights = mto_check$f_wt_totcore98,
               my_data = mto_check, 
               subgroup_file_label = subgroup_file_label, 
               subgroup_label = subgroup_label, 
               subgroup = subgroup, 
               covariate_terms = NULL, 
               covariates_baseline = FALSE,
               outcome_label = primary_analysis_c$outcome_label[11])
}

# Adult
check_pik(subgroup=mto_check$age_groups_ac == "Adult", 
          sample_name="3.A", 
          subgroup_label="Adult", 
          subgroup_file_label="adult",
          explicit_sample_desc = "adult",
          implicit_sample_desc = "non-adult")

# Age < 13
check_pik(subgroup=mto_check$age_groups_ac == "Age < 13", 
          sample_name = "3.B", 
          subgroup_label = "Age < 13", 
          subgroup_file_label = "age_less_13",
          explicit_sample_desc = "children less than 13 years of age",
          implicit_sample_desc = "adults; children 13 to 18")

# Age 13-18
check_pik(subgroup=mto_check$age_groups_ac == "Age 13-18", 
          sample_name = "3.C", 
          subgroup_label = "Age 13-18", 
          subgroup_file_label = "age_13_18",
          explicit_sample_desc = "children 13 to 18",
          implicit_sample_desc = "adults; children less than 13 years of age")

#############################################
# Analysis by demographic subgroups

# Clean up the race variable in the small dataset (df) 
df$dem_sex_female <- df$sex == "F"
df$dem_race_black <- df$X_F_AD_RACE_BLACK
df$dem_race_hispanic <- df$X_F_AD_ETHN_HISP
df$age_groups_ac
# Clean up the race variable in the bigger dataset
# Clean up the Hispanic variable
d$hispanic <- ifelse(d$f_svy_ethnic == 1, 1, 0)
d$hispanic[is.na(d$hispanic)] <- mean(d$hispanic[!is.na(d$hispanic)])
# Clean up the African American variable
d$black <- ifelse(d$f_svy_race == 1 & d$f_svy_ethnic != 1, 1, 0)
d$black[is.na(d$black)] <- mean(d$black[!is.na(d$black)])
# Clean up the sex variable
d$sex_F <- ifelse(d$sex == "F", 1, 0)

# F-test
# Make the model
f_test_formula <- function(outcome_var) {
  paste0(outcome_var, " ~ exp_group_Sec8 + exp_group_Exp +
            ra_site + dem_sex_female + dem_race_black + dem_race_hispanic +
            age_groups_ac + 
            ra_site:exp_group_Sec8 + dem_sex_female:exp_group_Sec8 + 
            dem_race_black:exp_group_Sec8 + dem_race_hispanic:exp_group_Sec8 +
            age_groups_ac:exp_group_Sec8 +
            ra_site:exp_group_Exp + dem_sex_female:exp_group_Exp + 
            dem_race_black:exp_group_Exp + dem_race_hispanic:exp_group_Exp +
            age_groups_ac:exp_group_Exp")
}

# In voter file
f_test_in_voter_file <- lm_robust(formula = as.formula(f_test_formula("in_voter_file")),
                                  data = df, weights = df$f_wt_totcore98, cluster = df$FAMID)
f_test_General_prop <- lm_robust(formula = as.formula(f_test_formula("General_prop")),
                                  data = df, weights = df$f_wt_totcore98, cluster = df$FAMID)
f_test_President_prop <- lm_robust(formula = as.formula(f_test_formula("President_prop")),
                                 data = df, weights = df$f_wt_totcore98, cluster = df$FAMID)
f_test_Midterm_prop <- lm_robust(formula = as.formula(f_test_formula("Midterm_prop")),
                                 data = df, weights = df$f_wt_totcore98, cluster = df$FAMID)


# F-tests function
lhypothesis_func <- function(model) {
  sec8 <- car::linearHypothesis(model, c("exp_group_Sec8TRUE:ra_site2 = 0",
                                                "exp_group_Sec8TRUE:ra_site3 = 0",
                                                "exp_group_Sec8TRUE:ra_site4 = 0",
                                                "exp_group_Sec8TRUE:ra_site5 = 0",
                                                "exp_group_Sec8TRUE:dem_sex_femaleTRUE = 0",
                                                "exp_group_Sec8TRUE:dem_race_black = 0",
                                                "exp_group_Sec8TRUE:dem_race_hispanic = 0",
                                                "exp_group_Sec8TRUE:age_groups_acAge < 13 = 0",
                                                "exp_group_Sec8TRUE:age_groups_acAge 13-18 = 0"))
  exp <- car::linearHypothesis(model, c("exp_group_ExpTRUE:ra_site2 = 0",
                                         "exp_group_ExpTRUE:ra_site3 = 0",
                                         "exp_group_ExpTRUE:ra_site4 = 0",
                                         "exp_group_ExpTRUE:ra_site5 = 0",
                                         "exp_group_ExpTRUE:dem_sex_femaleTRUE = 0",
                                         "exp_group_ExpTRUE:dem_race_black = 0",
                                         "exp_group_ExpTRUE:dem_race_hispanic = 0",
                                         "exp_group_ExpTRUE:age_groups_acAge < 13 = 0",
                                         "exp_group_ExpTRUE:age_groups_acAge 13-18 = 0"))
  treatments0 <- car::linearHypothesis(model, c("exp_group_Sec8TRUE = 0",
                                                "exp_group_ExpTRUE = 0"))
  
  # Return results
  return(c(sec8 = signif(sec8[2,"Pr(>Chisq)"], 4),
         exp = signif(exp[2,"Pr(>Chisq)"], 4),
         treatments0 = signif(treatments0[2,"Pr(>Chisq)"], 4)))
                                       
}

# F-test results
lhypothesis_func(model = f_test_in_voter_file)
lhypothesis_func(model = f_test_General_prop)
lhypothesis_func(model = f_test_President_prop)
lhypothesis_func(model = f_test_Midterm_prop)


# Adult: by sex (ITT)
# Female
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "1.A.female",
       analysis_subgroup = d$age_groups_ac == "Adult" & d$sex == "F",
       analysis_subgroup_label = "Adult, Sex: Female",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "adult_sex_female",
       marginal_means = TRUE,
       explicit_sample_desc = "adult, sex: female",
       implicit_sample_desc = "non-adult, sex: male",
       clusters = d$FAMID
)
# Male
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "1.A.male",
       analysis_subgroup = d$age_groups_ac == "Adult" & d$sex == "M",
       analysis_subgroup_label = "Adult, Sex: Male",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "adult_sex_male",
       marginal_means = TRUE,
       explicit_sample_desc = "adult, sex: male",
       implicit_sample_desc = "non-adult, sex: female",
       clusters = d$FAMID
)


# Children less than 13: by sex (ITT) 
# Female
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "2.A.female",
       analysis_subgroup = d$age_groups_ac == "Age < 13" & d$sex == "F",
       analysis_subgroup_label = "Age < 13; Sex: Female",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_less_13_sex_female",
       marginal_means = TRUE,
       explicit_sample_desc = "children less than 13 years of age, sex: female",
       implicit_sample_desc = "adults; children 13 to 18; children less than 13 years of age male",
       clusters = d$FAMID
)
# Male
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "2.A.male",
       analysis_subgroup = d$age_groups_ac == "Age < 13" & d$sex == "M",
       analysis_subgroup_label = "Age < 13; Sex: Male",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_less_13_sex_male",
       marginal_means = TRUE,
       explicit_sample_desc = "children less than 13 years of age, sex: male",
       implicit_sample_desc = "adults; children 13 to 18; children less than 13 years of age female",
       clusters = d$FAMID
)

# Children: 13-18: by sex (ITT)
# Female
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "3.A.female",
       analysis_subgroup = d$age_groups_ac == "Age 13-18" & d$sex == "F",
       analysis_subgroup_label = "Age 13-18; Sex: Female",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_13_18_sex_female",
       marginal_means = TRUE,
       explicit_sample_desc = "children 13 to 18, sex: female",
       implicit_sample_desc = "adults; children less than 13 years of age; children 13 to 18 male",
       clusters = d$FAMID
)
# Male
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "3.A.male",
       analysis_subgroup = d$age_groups_ac == "Age 13-18" & d$sex == "M",
       analysis_subgroup_label = "Age 13-18; Sex: Male",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_13_18_sex_male",
       marginal_means = TRUE,
       explicit_sample_desc = "children 13 to 18, sex: male",
       implicit_sample_desc = "adults; children less than 13 years of age; children 13 to 18 female",
       clusters = d$FAMID
)


# Adult: by race, ethnicity (ITT)
# Hispanic
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "1.A.hispanic",
       analysis_subgroup = d$age_groups_ac == "Adult" & d$hispanic == 1,
       analysis_subgroup_label = "Adult, Ethnicity: Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "adult_hispanic",
       marginal_means = TRUE,
       explicit_sample_desc = "adult, Hispanic",
       implicit_sample_desc = "non-adult, adults non-Hispanic",
       clusters = d$FAMID
)
# Black
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "1.A.black",
       analysis_subgroup = d$age_groups_ac == "Adult" & d$black == 1,
       analysis_subgroup_label = "Adult, Race: Black, non-Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "adult_black",
       marginal_means = TRUE,
       explicit_sample_desc = "adult, Black, non-Hispanic",
       implicit_sample_desc = "non-adult; adults not Black, non-Hispanic",
       clusters = d$FAMID
)
# not Hispanic or Black
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "1.A.other",
       analysis_subgroup = d$age_groups_ac == "Adult" & d$black != 1 & d$hispanic != 1,
       analysis_subgroup_label = "Adult, Race/Ethnicity: not Black, non-Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "adult_black",
       marginal_means = TRUE,
       explicit_sample_desc = "adult, not Black, non-Hispanic",
       implicit_sample_desc = "non-adult; adults Black or Hispanic",
       clusters = d$FAMID
)

# Children less than 13: by race, ethnicity (ITT)
# Hispanic
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "2.A.hispanic",
       analysis_subgroup = d$age_groups_ac == "Age < 13" & d$hispanic == 1,
       analysis_subgroup_label = "Age < 13, Ethnicity: Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_less_13_hispanic",
       marginal_means = TRUE,
       explicit_sample_desc = "children less than 13 years of age, Hispanic",
       implicit_sample_desc = "adult; children 13-18; children less than 13 years of age non-Hispanic",
       clusters = d$FAMID
)
# Black
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "2.A.black",
       analysis_subgroup = d$age_groups_ac == "Age < 13" & d$black == 1,
       analysis_subgroup_label = "Age < 13, Race: Black, non-Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_less_13_black",
       marginal_means = TRUE,
       explicit_sample_desc = "children less than 13 years of age, Black, non-Hispanic",
       implicit_sample_desc = "adult; children 13-18; children less than 13 years of age not Black, non-Hispanic",
       clusters = d$FAMID
)
# not Hispanic or Black
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "2.A.other",
       analysis_subgroup = d$age_groups_ac == "Age < 13" & d$black != 1 & d$hispanic != 1,
       analysis_subgroup_label = "Age < 13, Race/Ethnicity: not Black, non-Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_less_13_black",
       marginal_means = TRUE,
       explicit_sample_desc = "children less than 13 years of age, not Black, non-Hispanic",
       implicit_sample_desc = "adult; children 13-18; children less than 13 years of age, Black or Hispanic",
       clusters = d$FAMID
)


# Children: 13-18: by race, ethnicity (ITT)
# Hispanic
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "3.A.hispanic",
       analysis_subgroup = d$age_groups_ac == "Age 13-18" & d$hispanic == 1,
       analysis_subgroup_label = "Age 13-18, Ethnicity: Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_13_18_hispanic",
       marginal_means = TRUE,
       explicit_sample_desc = "children 13-18, Hispanic",
       implicit_sample_desc = "adult; children less than 13 years of age; children 13-18 non-Hispanic",
       clusters = d$FAMID
)
# Black
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "3.A.black",
       analysis_subgroup = d$age_groups_ac == "Age 13-18" & d$black == 1,
       analysis_subgroup_label = "Age 13-18, Race: Black, non-Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_13_18_black",
       marginal_means = TRUE,
       explicit_sample_desc = "children 13-18, Black, non-Hispanic",
       implicit_sample_desc = "adult; children less than 13 years of age; children 13-18 not Black, non-Hispanic",
       clusters = d$FAMID
)
# not Hispanic or Black
lapply(1:nrow(primary_analysis), itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "3.A.other",
       analysis_subgroup = d$age_groups_ac == "Age 13-18" & d$black != 1 & d$hispanic != 1,
       analysis_subgroup_label = "Age 13-18, Race/Ethnicity: not Black, non-Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_13_18_black",
       marginal_means = TRUE,
       explicit_sample_desc = "children 13-18, not Black, non-Hispanic",
       implicit_sample_desc = "adult; children less than 13 years of age; children 13-18, Black or Hispanic",
       clusters = d$FAMID
)


################################################################
# Effect on self-reported being in jail or prison

# Youth jail or prison data
d$reported_jail <- NA
d$reported_jail[d$HHO10A == 1 & !is.na(d$HHO10A)] <- 1
d$reported_jail[d$HHO10A == 5 & !is.na(d$HHO10A)] <- 0

# Function to do the randomization based on block and cluster
cluster_block_rando <- function(ra_site) {
  block_select <- d$ra_site == ra_site
  exp_group_Control_n <- length(unique(d$famid.x[block_select & d$exp_group == "Control"]))
  exp_group_Exp_n <- length(unique(d$famid.x[block_select & d$exp_group_Exp]))
  exp_group_Sec8_n <- length(unique(d$famid.x[block_select & d$exp_group_Sec8]))
  famid_unique <- unique(d$famid.x[block_select])
  perm_sample <- sample(x=c(rep("Control", exp_group_Control_n),
                            rep("Exp", exp_group_Exp_n),
                            rep("Sec8", exp_group_Sec8_n)))
  return(data.frame(famid.x = famid_unique, perm_exp_group = perm_sample))
}

# Make a mini dataset
mini_d_o <- d[,c("reported_jail", "exp_group", "famid.x", "f_wt_raratio98",
                 "age_groups_ac", "ra_site")]

# All clusters assigned
perm_regression <- function(age_group_jail, num) {
  print(num)
  all_clusters_rando <- do.call(rbind, lapply(1:5, cluster_block_rando))
  all_clusters_rando$famid.x <- as.character(all_clusters_rando$famid.x)
  mini_d <- merge(x = mini_d_o, y = all_clusters_rando, all.x = TRUE)
  perm_reg <- lm_lin(formula=reported_jail~perm_exp_group, covariates=~ra_site, data=mini_d, 
                     weights=f_wt_raratio98, cluster = famid.x,
                     subset = mini_d$age_groups_ac == age_group_jail & 
                       !is.na(mini_d$reported_jail))
  return(perm_reg$coefficients[1:3])
}

# Function to generate the disclosure outputs, sample description, and memo
ri_report <- function(raw_perm,
                      outcome = "reported_jail",
                      outcome_label = "Self-reported incarceration",
                      covariates_baseline = FALSE,
                      subgroup = d$age_groups_ac == "Age < 13" & 
                        !is.na(d$reported_jail),
                      subgroup_label = "Age < 13",
                      explicit_sample_desc = "children less than 13 years of age with self-reported incarceration data",
                      implicit_sample_desc = "children 13 to 18 with self-reported incarceration data",
                      marginal_means = FALSE,
                      output_number = 12,
                      sample_name = "4.B") {
  
  pl = data.frame(do.call(rbind, raw_perm))
  
  # Make the file names
  file_number = paste0(output_number, ".", sample_name, ".RI")
  # Output file name
  output_file_name = paste0("o_", file_number)
  # Disclosure sample file name
  disclosure_sample_file_name = paste0("ds_", file_number)
  
  # Analyze the actual data
  rl = lm_lin(formula=reported_jail~exp_group, 
              covariates=~ra_site, 
              data=d, 
              weights=d$f_wt_raratio98, cluster = d$FAMID,
              subset = subgroup)
  
  # Get out the p-value
  # One-sided
  pl_1 = mean(pl$perm_exp_groupExp >= rl$coefficients[2])
  # Two-sided
  pl_2 = mean(abs(pl$perm_exp_groupExp) >= rl$coefficients[2])
  
  # Put together the output table
  output_d = data.frame(output_file_name = c(output_file_name, NA, NA), 
                        disclosure_sample_file_name = c(disclosure_sample_file_name, NA, NA),
                        sample_name = c(sample_name, NA, NA), 
                        outcome_variable = c(outcome, NA, NA),
                        outcome_label = c(outcome_label, NA, NA),
                        subgroup = c(subgroup_label, NA, NA),
                        baseline_covariates = c(covariates_baseline, NA, NA),
                        variable_name = c("Control mean", "Exp", "Sec8"),
                        coefficients = signif(rl$coefficients[1:3], 4), 
                        standard_errors = signif(rl$std.error[1:3], 4), 
                        p_values = signif(rl$p.value[1:3], 4), 
                        N = c(count_round(rl$N), NA, NA),
                        N_clusters = c(count_round(rl$N_clusters), NA, NA),
                        ri_p_value_1_sided = c(signif(pl_1, 4), NA, NA),
                        ri_p_value_2_sided = c(signif(pl_2, 4), NA, NA),
                        analysis_type = "ITT estimates; randomization inference", 
                        row.names=NULL)
  
  # Make the disclosure statistics
  
  explicit_desc = paste0("Explicit sample: ", 
                         explicit_sample_desc)
  # Overall counts
  explicit_n = data.frame(output_d[1, c("output_file_name", 
                                        "disclosure_sample_file_name",
                                        "outcome_variable")],
                          sample = explicit_desc,
                          participant_n = rl$N,
                          cluster_n = rl$N_clusters,
                          permutations = 5000)
  
  # Implicit sample
  implicit_n = data.frame(sample = paste0("Implicit sample: ",
                                          implicit_sample_desc),
                          participant_n = nrow(d[!subgroup,]),
                          cluster_n = length(unique(d$FAMID[!subgroup])))
  
  # By experimental group
  exp_group_n = d[subgroup,] %>% 
    group_by(experimental_group = exp_group) %>% dplyr::summarize(
      participant_n = n(),
      cluster_n = length(unique(FAMID))
    )
  exp_group_n = data.frame(sample = explicit_desc,
                           exp_group_n)
  
  desc_stat = plyr::rbind.fill(explicit_n, implicit_n, exp_group_n)
  
  # By experimental group and outcome if outcome is binary
  if (length(unique(d[subgroup,outcome])) == 2) {
    d$outcome_variable <- d[,outcome]
    outcome_exp_group_n <- d[subgroup,] %>% 
      group_by(outcome_variable,                     
               experimental_group = exp_group) %>% 
      summarize(
        participant_n = n(),
        cluster_n = length(unique(FAMID))
      )
    names(outcome_exp_group_n)[names(outcome_exp_group_n) == "outcome_variable"] <-
      "outcome_variable_value"
    outcome_exp_group_n <- data.frame(sample = explicit_desc,
                                      outcome_exp_group_n)
    
    desc_stat <- plyr::rbind.fill(desc_stat, outcome_exp_group_n)
  }
  
  # Output the regression results to a sheet in the Excel file
  
  output_file_dir = "/projects/data/disclosure_tables/"
  
  # Output file
  write.xlsx(x = output_d, row.names = FALSE, append = TRUE,
             file = paste0(output_file_dir, "output_tables/output_disclosure_june_2022.xlsx"), 
             sheetName = output_file_name)
  
  # Disclosure sample file
  write.xlsx(x = desc_stat, row.names = FALSE, append = TRUE,
             file = paste0(output_file_dir, "output_tables/output_disclosure_sample_june_2022.xlsx"), 
             sheetName = disclosure_sample_file_name)
  
  # Make file description 
  file_description <- paste0("Intention-to-treat estimates of the experimental and Section 8 vouchers on outcome; Outcome: ",
                             outcome_label, "; randomization inference to test for potential mediator; Age group at time of randomization: ", 
                             subgroup_label,
                             "; Regression includes pre-treatment covariates: ",
                             ifelse(covariates_baseline, "Yes", "No"),
                             "; Includes marginal means: ",
                             ifelse(marginal_means, "Yes", "No"))
  
  # Make memo file 
  memo_output <- paste0("FILE NUMBER: ", file_number, "\n",
                        "FILE NAME: output_disclosure_june_2022.xlsx, worksheet: ", output_file_name, "\n",
                        "FILE DESCRIPTION: ", file_description , "\n",
                        "RESEARCH OUTPUT PROGRAM: ", "data_analysis_disclosure_2.R", "\n",
                        "RESEARCH SAMPLE NUMBER: ", sample_name, "\n",
                        "DISCLOSURE STATISTICS FILE NAME: output_disclosure_sample_june_2022.xlsx, worksheet:", disclosure_sample_file_name, "\n",
                        "DISCLOSURE STATISTICS PROGRAM: ", "data_analysis_disclosure_2.R", "\n",
                        "COMMENT: ", "NA", "\n"
  )
  
  # Write memo to output directory
  write(memo_output, append = TRUE,
        file = paste0(output_file_dir, "memo/memo_information_june_2022.txt"))
}



# Generate the permutation results for Age < 13
perm_jail_age_less_13 <-
  lapply(1:5000, perm_regression, age_group_jail = "Age < 13")
save(perm_jail_age_less_13, file="/projects/data/analysis_data/ri_results_age_less_13.RData")
# Load the results
load("/projects/data/analysis_data/ri_results_age_less_13.RData")

# Generate the permutation results for Age < 13
perm_jail_age_13_18 <-
  lapply(1:5000, perm_regression, age_group_jail = "Age 13-18")
save(perm_jail_age_13_18, file="/projects/data/analysis_data/ri_results_age_13_18.RData")
# Load the results
load("/projects/data/analysis_data/ri_results_age_13_18.RData")

# Age < 13
ri_report(raw_perm=perm_jail_age_less_13)

# Age 13-18
ri_report(raw_perm = perm_jail_age_13_18,
          outcome = "reported_jail",
          outcome_label = "Self-reported incarceration",
          covariates_baseline = FALSE,
          subgroup = d$age_groups_ac == "Age 13-18" & 
            !is.na(d$reported_jail),
          subgroup_label = "Age 13-18",
          implicit_sample_desc = "children less than 13 years of age with self-reported incarceration data",
          explicit_sample_desc = "children 13 to 18 with self-reported incarceration data",
          marginal_means = FALSE,
          output_number = 12,
          sample_name = "4.C")


# Clean up some variables in the Final Roster dataset

# Race and ethnicity
d$race_ethn <- "Black"
d$race_ethn[d$f_svy_ethnic == 1] <- "Hispanic"
d$race_ethn[d$f_svy_race != 1 & d$f_svy_ethnic != 1] <- "Other"

# Head of household single
d$hh_single <- "Not single"
d$hh_single[d$FAMID %in% unique(d$FAMID[d$HHO6 == 1])] <- "Single"

# Head of household working
d$hh_working <- "No information"
d$hh_working[d$FAMID %in% unique(d$FAMID[d$HHO5 == 1])] <- "Working full-time"
d$hh_working[d$FAMID %in% unique(d$FAMID[d$HHO5 == 2])] <- "Working part-time"
d$hh_working[d$FAMID %in% unique(d$FAMID[d$HHO5 == 3])] <- "Not working"

# Head of household graduated high school
d$hh_high_school <- "No information"
d$hh_high_school[d$FAMID %in% unique(d$FAMID[d$HHO3 == 1])] <- "GED"
d$hh_high_school[d$FAMID %in% unique(d$FAMID[d$HHO3 == 2])] <- "Graduated high school"
d$hh_high_school[d$FAMID %in% unique(d$FAMID[d$HHO3 == 3])] <- "No GED or high school degree"

# Number of people in household
# Count the number of people in the household
hh_size <- d %>% group_by(FAMID) %>% dplyr::summarise(
  hh_size = n()
)
d$famid.x <- NULL
d$famid.y <- NULL

d <- merge(x = d, y = hh_size, by = "FAMID", all.x = TRUE)

# Categorize age
d$hh_size_c <- ""
d$hh_size_c[d$hh_size < 3] <- "1-2"
d$hh_size_c[d$hh_size == 3] <- "3"
d$hh_size_c[d$hh_size == 4] <- "4"
d$hh_size_c[d$hh_size == 5] <- "5"
d$hh_size_c[d$hh_size > 5] <- "6 or more"

# Make the sites
d$ra_site_c <- ""
d$ra_site_c[d$ra_site == 1] <- "Baltimore"
d$ra_site_c[d$ra_site == 2] <- "Boston"
d$ra_site_c[d$ra_site == 3] <- "Chicago"
d$ra_site_c[d$ra_site == 4] <- "LA"
d$ra_site_c[d$ra_site == 5] <- "NYC"

##################################################
# Make the proportion of people matched or not to voter file
# Site
p_ra_site <- d %>% group_by(subgroup = ra_site_c, exp_group) %>% 
  dplyr::summarise(variable = "Site",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Age group
p_age_group <- d %>% group_by(subgroup = age_groups_ac, exp_group) %>% 
  dplyr::summarise(variable = "Age group",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Gender
p_gender <- d %>% group_by(subgroup = f_svy_gender, exp_group) %>% 
  dplyr::summarise(variable = "Gender",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Race/ethnicity
p_race_ethn <- d %>% group_by(subgroup = race_ethn, exp_group) %>% 
  dplyr::summarise(variable = "Race/ethnicity",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Head of household never married
p_hh_single <- d %>% group_by(subgroup = hh_single, exp_group) %>% 
  dplyr::summarise(variable = "HH never married",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Head of household working
p_hh_working <- d %>% group_by(subgroup = hh_working, exp_group) %>% 
  dplyr::summarise(variable = "HH work status",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Head of household graduated high school
p_hh_high_school <- d %>% group_by(subgroup = hh_high_school, exp_group) %>% 
  dplyr::summarise(variable = "HH education status",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Household size
p_hh_size <- d %>% group_by(subgroup = hh_size_c, exp_group) %>% 
  dplyr::summarise(variable = "Household size",
                   prop_in_voter_file = mean(as.numeric(in_voter_file)),
                   n = n())

# Combine into one table
p_output <- rbind(p_ra_site, p_age_group, p_gender, p_race_ethn,
                  p_hh_single, p_hh_working, p_hh_high_school, p_hh_size)
p_output$n_in_voter_file = p_output$prop_in_voter_file*p_output$n
p_output$n_not_in_voter_file = (1-p_output$prop_in_voter_file)*p_output$n
p_output$prop_in_voter_file <- ifelse(p_output$n > 1000,
  signif(p_output$prop_in_voter_file, digits = 3),
  signif(p_output$prop_in_voter_file, digits = 2))
p_output$n_sample <- p_output$n
p_output$n <- unlist(lapply(FUN = count_round, X = p_output$n))
write.csv(x = p_output, file = "/projects/data/rr_disclosure/prop_in_voter_file_demo.csv", row.names = FALSE)

# Output regression results
compare_demo <- lm_robust(in_voter_file ~ ra_site_c + age_groups_ac +
                            f_svy_gender + race_ethn + hh_single + hh_working +
                            hh_high_school + hh_size, data = d, clusters = FAMID)
write.csv(data.frame(Variable = names(compare_demo$coefficients),
                     Coef = signif(compare_demo$coefficients, 4),
                     SE = signif(compare_demo$std.error, 4),
                     p_value = signif(compare_demo$p.value, 4)),
          file = "/projects/data/rr_disclosure/regression_table_predicting_in_voter_file.csv",
          row.names = FALSE)

# Count round
count_round(compare_demo$nobs)
count_round(compare_demo$nclusters)

# Merge the car and license data
d <- merge(x = d, y = mto_baseline, by = "FAMID", all.x = TRUE)


##################################################
# PVS Metadata Analysis 

# Read in number of observations in each state
obs <- read.csv("/projects/data/l2_obs_count.csv")
# 
# Check by going through each state chunk by chunk
chunk_check <- function(state, chunk_size = 50000) {
  # Number of observations per state
  state_obs <- obs$obs[obs$state == state]
  # Number of chunks set
  n_chunk <- ceiling(state_obs/chunk_size)
  # Print number of chunks
  print(paste0(state, ": number of chunks is ", n_chunk))
  # Data frame to share the results
  pik_dat <- data.frame(chunk = 1:n_chunk,
                        no_pik = NA,
                        yes_pik = NA,
                        with_mafid = NA,
                        verflag_A = NA,
                        verflag_D = NA,
                        verflag_M = NA,
                        verflag_S = NA,
                        verflag_T = NA)
  # Load in the data chunk by chunk
  for (i in 1:n_chunk) {
    check_state <- read_sas(data_file = paste0("[redacted]/uchicago_l2_2020_", state, "_pvs.sas7bdat"),
                            skip = (i-1)*chunk_size, n_max = chunk_size, col_select = c("pik", "mafid", "pvs_verflg"))
    pik_dat$no_pik[i] <- sum(check_state$pik == "")
    pik_dat$yes_pik[i] <- sum(check_state$pik != "")
    pik_dat$with_mafid[i] <- sum(!is.na(check_state$mafid))
    pik_dat$verflag_A[i] <- sum(check_state$pvs_verflg == "A")
    pik_dat$verflag_D[i] <- sum(check_state$pvs_verflg == "D")
    pik_dat$verflag_M[i] <- sum(check_state$pvs_verflg == "M")
    pik_dat$verflag_S[i] <- sum(check_state$pvs_verflg == "S")
    pik_dat$verflag_T[i] <- sum(check_state$pvs_verflg == "T")
    print(paste0("Chunk ", i))
  }
  pik_dat$state <- state
  # Output the data
  write.csv(pik_dat, file = paste0("/projects/data/l2_check/", state, "_l2_check.csv"), row.names = FALSE, append = FALSE)
  # Print that state is done
  print(paste0(state, ": check completed"))
}

# Loop through the states
for (s in obs$state) {
  chunk_check(state = s)
}

read_state_pvs <- function(state) {
  read.csv(paste0("/projects/data/l2_check/", state, "_l2_check.csv"))
}

# Read in all the state results
pvs_states <- do.call(rbind, lapply(X = obs$state, FUN = read_state_pvs))


pvs_states <- pvs_states %>% group_by(state) %>% summarise(
  no_pik = sum(no_pik),
  yes_pik = sum(yes_pik),
  with_mafid = sum(with_mafid),
  verflag_A = sum(verflag_A),
  verflag_D = sum(verflag_D),
  verflag_M = sum(verflag_M),
  verflag_S = sum(verflag_S),
  verflag_T = sum(verflag_T)
)

# Find total N
pvs_states$total_N <- pvs_states$no_pik + pvs_states$yes_pik
# write.csv(pvs_states, "/projects/disclosure/july_2023_request/output_table/l2_pvs_check_sample.csv", row.names = FALSE)


# Make everything 4 significant digits
pvs_output <- pvs_states[,3:9]/pvs_states$total_N
pvs_output <- signif(rbind(pvs_output, colSums(pvs_states[,3:9])/sum(pvs_states$total_N)), 4)
pvs_output$state <- c(obs$state, "us")
pvs_output$N <- c(pvs_states$total_N, sum(pvs_states$total_N))
pvs_output$N <- unlist(lapply(X = pvs_output$N, FUN = count_round))

# Output the results
# write.csv(pvs_output, "/projects/disclosure/july_2023_request/output_table/l2_pvs_check.csv", row.names = FALSE)

# Check MTO PVS

# Load MTO datasets
# ID dataset
mto_id <- read_sas("[redacted]/mto1994_2010.sas7bdat")
mto_id$pik_missing_dup <- mto_id$pik == "" | duplicated(mto_id$pik)
mean(mto_id$pik_missing_dup)
mto_id_check_raw <- table(mto_id$verflg)

mto_id_check <- signif(c(table(mto_id$verflg)/nrow(mto_id), with_pik = mean(!mto_id$pik_missing_dup)), 4)
mto_id_check <- data.frame(t(mto_id_check), N = count_round(nrow(mto_id)))
names(mto_id_check)[1:6] <- paste0("verflag_", names(mto_id_check)[1:6])
# write.csv(mto_id_check, "/projects/disclosure/july_2023_request/output_table/mto_pvs_check.csv", row.names = FALSE)



##########################################################
# Whether the head of household has driver's license

# Adults

lapply(1:9, itt_analysis_function,
       analysis_subgroup = d$f_svy_pers_baseline == "1" & d$has_license,
       analysis_subgroup_label = "head of household; head of household had driver's license at baseline",
       subgroup_file_label = "1_A_hh_license",
       sample_name = "1.A.hh.license",
       explicit_sample_desc = "adult head of household had driver's license at baseline",
       implicit_sample_desc = "non-adult, head of household did not have driver's license at baseline",
       marginal_means = FALSE)

lapply(1:9, itt_analysis_function,
       analysis_subgroup = d$f_svy_pers_baseline == "1" & !d$has_license,
       analysis_subgroup_label = "head of household; head of household did not have driver's license at baseline",
       subgroup_file_label = "1_A_hh_no_license", 
       sample_name = "1.A.hh.no.license",
       explicit_sample_desc = "adult head of household did not have driver's license at baseline",
       implicit_sample_desc = "non-adult, head of household had driver's license at baseline",
       marginal_means = FALSE)

##########################################################
# Whether the head of household has car

# Adults
lapply(1:9, itt_analysis_function,
       analysis_subgroup = d$f_svy_pers_baseline == "1" & d$has_car,
       analysis_subgroup_label = "head of household; head of household had car baseline",
       subgroup_file_label = "1_A_hh_car",
       sample_name = "1.A.hh.car",
       explicit_sample_desc = "adult head of household had car at baseline",
       implicit_sample_desc = "non-adult, head of household did not have car at baseline",
       marginal_means = FALSE)
lapply(1:9, itt_analysis_function,
       analysis_subgroup = d$f_svy_pers_baseline == "1" & !d$has_car,
       analysis_subgroup_label = "head of household; head of household did not have car baseline",
       subgroup_file_label = "1_A_hh_no_car",
       sample_name = "1.A.hh.no.car",
       explicit_sample_desc = "adult head of household did not have car at baseline",
       implicit_sample_desc = "non-adult, head of household had car at baseline",
       marginal_means = FALSE)

################################################
# In voter files

# Effects on only those in voter files
# Adults
lapply(2:9, itt_analysis_function,
       analysis_subgroup = d$age_groups_ac == "Adult" & d$in_voter_file,
       analysis_subgroup_label = "Adult in voter file",
       subgroup_file_label = "1_A_L2",
       explicit_sample_desc = "adult in voter file",
       sample_name = "1.A.L2",
       implicit_sample_desc = "non-adult, not in voter file",
       marginal_means = FALSE)

# Look up the implicit sample counts
not_l2_adult <- which(d$age_groups_ac != "Adult" | !d$in_voter_file)
table(d$exp_group[not_l2_adult])
table(d$exp_group[not_l2_adult][!duplicated(d$FAMID[not_l2_adult])])

not_l2_l13 <- which(d$age_groups_ac != "Age < 13" | !d$in_voter_file)
table(d$exp_group[not_l2_l13])
table(d$exp_group[not_l2_l13][!duplicated(d$FAMID[not_l2_l13])])

not_l2_13_18 <- which(d$age_groups_ac != "Age 13-18" | !d$in_voter_file)
table(d$exp_group[not_l2_13_18])
table(d$exp_group[not_l2_13_18][!duplicated(d$FAMID[not_l2_13_18])])

# Age < 13
# ITT
for (i in 2:9) {
  print(primary_analysis$outcome_label[i])
  # Subgroup analysis: need to figure out who is eligible to vote in what year 
  # d  
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_d <- d$age_groups_ac == "Age < 13" & d[,paste0("canvote_", election_year)]
    age_groups_not_eligible_d <- d$age_groups_ac == "Age < 13" & d[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_d_n <- sum(age_groups_not_eligible_d)
    my_vote_eligibility_d_cluster <- length(unique(d$FAMID[age_groups_not_eligible_d]))
  } else {
    subgroup_analysis_d <- d$age_groups_ac == "Age < 13"
    my_vote_eligibility_d_n <- NULL
    my_vote_eligibility_d_cluster <- NULL
  }
  outcome_variable <- primary_analysis$outcome[i]
  outcome_y <- d[,outcome_variable]
  # Without baseline covariates 
  if (sum(subgroup_analysis_d) > [redacted] & sum(outcome_y[subgroup_analysis_d]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = paste0(primary_analysis$child_less_13_fr[i], ".L2"),
                          covariates_baseline = FALSE,
                          analysis_subgroup = subgroup_analysis_d & d$in_voter_file,
                          analysis_subgroup_label = "Age < 13 in voter file",
                          my_data = d, 
                          weights = d$f_wt_totcore98, 
                          marginal_means = FALSE,
                          clusters = d$FAMID,
                          explicit_sample_desc = "children less than 13 years of age; in voter file",
                          implicit_sample_desc = "adults; children 13 to 18; not in voter file",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster)
  }
}

# Run the regressions: Age 13 to 18
for (i in 2:9) {
  print(primary_analysis$outcome_label[i])
  # Subgroup analysis: need to figure out who is eligible to vote in what year
  # d  
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_d <- d$age_groups_ac == "Age 13-18" & d[,paste0("canvote_", election_year)]
    age_groups_not_eligible_d <- d$age_groups_ac == "Age 13-18" & d[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_d_n <- sum(age_groups_not_eligible_d)
    my_vote_eligibility_d_cluster <- length(unique(d$FAMID[age_groups_not_eligible_d]))
  } else {
    subgroup_analysis_d <- d$age_groups_ac == "Age 13-18"
    my_vote_eligibility_d_n <- NULL
    my_vote_eligibility_d_cluster <- NULL
  }
  
  outcome_variable <- primary_analysis$outcome[i]
  outcome_y <- d[,outcome_variable]
  # Without baseline covariates 
  if (sum(subgroup_analysis_d) > [redacted] & sum(outcome_y[subgroup_analysis_d]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = paste0(primary_analysis$child_13_18_fr[i], ".L2"),
                          covariates_baseline = FALSE,
                          analysis_subgroup = subgroup_analysis_d & d$in_voter_file,
                          analysis_subgroup_label = "Age 13-18",
                          my_data = d, 
                          weights = d$f_wt_totcore98, 
                          marginal_means = FALSE,
                          clusters = d$FAMID,
                          explicit_sample_desc = "children 13 to 18; in voter file",
                          implicit_sample_desc = "adults; children less than 13 years of age; not in voter file",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster)
  }
  
}

# Effects on only those with 4 or more people in their household
# Adults
lapply(c(1, 7:9), itt_analysis_function,
       sample_name = "1.A.4more",
       analysis_subgroup = d$age_groups_ac == "Adult" & d$hh_size > 3,
       analysis_subgroup_label = "Adult in households of 4 or greater",
       subgroup_file_label = "1_A_4more",
       explicit_sample_desc = "adult, household size 4 or greater",
       implicit_sample_desc = "non-adult, household size less than 4",
       marginal_means = FALSE)

# Look up the implicit sample counts
# Adult
lesshh_adult <- which(d$age_groups_ac != "Adult" | d$hh_size <= 3)
table(d$exp_group[lesshh_adult])
table(d$exp_group[lesshh_adult][!duplicated(d$FAMID[lesshh_adult])])
# in_voter_file
table(d$exp_group[lesshh_adult], d$in_voter_file[lesshh_adult])
table(d$exp_group[lesshh_adult][!duplicated(d$FAMID[lesshh_adult])],
      d$in_voter_file[lesshh_adult][!duplicated(d$FAMID[lesshh_adult])])

# Age < 13
lesshh_l13 <- which(d$age_groups_ac != "Age < 13" | d$hh_size <= 3)
table(d$exp_group[lesshh_l13])
table(d$exp_group[lesshh_l13][!duplicated(d$FAMID[lesshh_l13])])
# in_voter_file
table(d$exp_group[lesshh_l13], d$in_voter_file[lesshh_l13])
table(d$exp_group[lesshh_l13][!duplicated(d$FAMID[lesshh_l13])],
      d$in_voter_file[lesshh_l13][!duplicated(d$FAMID[lesshh_l13])])

# Age 13-18
lesshh_13_18 <- which(d$age_groups_ac != "Age 13-18" | d$hh_size <= 3)
table(d$exp_group[lesshh_13_18])
table(d$exp_group[lesshh_13_18][!duplicated(d$FAMID[lesshh_13_18])])
# in_voter_file
table(d$exp_group[lesshh_13_18], d$in_voter_file[lesshh_13_18])
table(d$exp_group[lesshh_13_18][!duplicated(d$FAMID[lesshh_13_18])],
      d$in_voter_file[lesshh_13_18][!duplicated(d$FAMID[lesshh_13_18])])

# Age < 13
# ITT
for (i in c(1, 7:9)) {
  print(primary_analysis$outcome_label[i])
  # Subgroup analysis: need to figure out who is eligible to vote in what year 
  # d  
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_d <- d$age_groups_ac == "Age < 13" & d[,paste0("canvote_", election_year)]
    age_groups_not_eligible_d <- d$age_groups_ac == "Age < 13" & d[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_d_n <- sum(age_groups_not_eligible_d)
    my_vote_eligibility_d_cluster <- length(unique(d$FAMID[age_groups_not_eligible_d]))
  } else {
    subgroup_analysis_d <- d$age_groups_ac == "Age < 13"
    my_vote_eligibility_d_n <- NULL
    my_vote_eligibility_d_cluster <- NULL
  }
  outcome_variable <- primary_analysis$outcome[i]
  outcome_y <- d[,outcome_variable]
  # Without baseline covariates 
  if (sum(subgroup_analysis_d) > [redacted] & sum(outcome_y[subgroup_analysis_d]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = "B.4more",
                          covariates_baseline = FALSE,
                          analysis_subgroup = subgroup_analysis_d & d$hh_size > 3,
                          analysis_subgroup_label = "Age < 13 in household size 4 or greater",
                          my_data = d, 
                          weights = d$f_wt_totcore98, 
                          marginal_means = FALSE,
                          clusters = d$FAMID,
                          explicit_sample_desc = "children less than 13 years of age; household size 4 or greater",
                          implicit_sample_desc = "adults; children 13 to 18; household size less than 4",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster)
  }
}

# Run the regressions: Age 13 to 18
for (i in c(1, 7:9)) {
  print(primary_analysis$outcome_label[i])
  # Subgroup analysis: need to figure out who is eligible to vote in what year
  # d  
  if (!is.na(primary_analysis$election_year[i])) {
    election_year <- primary_analysis$election_year[i]
    subgroup_analysis_d <- d$age_groups_ac == "Age 13-18" & d[,paste0("canvote_", election_year)]
    age_groups_not_eligible_d <- d$age_groups_ac == "Age 13-18" & d[,paste0("canvote_", election_year)] == FALSE
    # generate the eligibility n and cluster
    my_vote_eligibility_d_n <- sum(age_groups_not_eligible_d)
    my_vote_eligibility_d_cluster <- length(unique(d$FAMID[age_groups_not_eligible_d]))
  } else {
    subgroup_analysis_d <- d$age_groups_ac == "Age 13-18"
    my_vote_eligibility_d_n <- NULL
    my_vote_eligibility_d_cluster <- NULL
  }
  
  outcome_variable <- primary_analysis$outcome[i]
  outcome_y <- d[,outcome_variable]
  # Without baseline covariates 
  if (sum(subgroup_analysis_d) > [redacted] & sum(outcome_y[subgroup_analysis_d]) > 3) {
    # ITT
    itt_analysis_function(variable_number = i, 
                          include_N = TRUE, 
                          sample_name = "C.4more",
                          covariates_baseline = FALSE,
                          analysis_subgroup = subgroup_analysis_d & d$hh_size > 3,
                          analysis_subgroup_label = "Age 13-18 in household size 4 or greater",
                          my_data = d, 
                          weights = d$f_wt_totcore98, 
                          marginal_means = FALSE,
                          clusters = d$FAMID,
                          explicit_sample_desc = "children 13 to 18; household size 4 or greater",
                          implicit_sample_desc = "adults; children less than 13 years of age; household size less than 4",
                          vote_eligibility_n = my_vote_eligibility_d_n,
                          vote_eligibility_cluster = my_vote_eligibility_d_cluster)
  }
  
}

################################################################
# Effects by site

site_itt_function <- function(site, not_age_group, age_group, age_group_letter) {
  lapply(c(1, 7, 8, 9), itt_analysis_function,
         covariates_baseline = FALSE,
         covariate_terms = NULL,
         sample_name = paste0("1.", age_group_letter, ".", site),
         analysis_subgroup = d$age_groups_ac == age_group & d$ra_site_c == site,
         analysis_subgroup_label = paste0(age_group, ", Site: ", site),
         my_data = d, 
         weights = d$f_wt_totcore98, 
         subgroup_file_label = tolower(paste0(age_group, "_", site)),
         marginal_means = FALSE,
         explicit_sample_desc = paste0(age_group, ", site: ", site),
         implicit_sample_desc = paste0(not_age_group, ", site: not ", site),
         clusters = d$FAMID, no_site = TRUE
  )
}

# By site
ra_site <- unique(d$ra_site_c)

# Adult: by site
lapply(ra_site, site_itt_function, age_group = "Adult",
       not_age_group = "non-adult", age_group_letter = "A")

# Children less than 13: by site
lapply(ra_site, site_itt_function, age_group = "Age < 13",
       not_age_group = "adults; children 13 to 18", age_group_letter = "B")

# Children 13-18: by site
lapply(ra_site, site_itt_function, age_group = "Age 13-18",
       not_age_group = "adults; children less than 13 years of age", age_group_letter = "C")

############################################
# Fix Hispanic Age 13-18 issue
# Clean up the Hispanic variable
d$hispanic <- ifelse(d$f_svy_ethnic == 1, 1, 0)
d$hispanic[is.na(d$hispanic)] <- mean(d$hispanic[!is.na(d$hispanic)])

# Age < 13
nothis_13_18 <- which(d$age_groups_ac != "Age 13-18" | d$hispanic != 1)
table(d$exp_group[nothis_13_18])
table(d$exp_group[nothis_13_18][!duplicated(d$FAMID[nothis_13_18])])
# in_voter_file
table(d$exp_group[nothis_13_18], d$in_voter_file[nothis_13_18])
table(d$exp_group[nothis_13_18][!duplicated(d$FAMID[nothis_13_18])],
      d$in_voter_file[nothis_13_18][!duplicated(d$FAMID[nothis_13_18])])


# Run the variables
lapply(1:9, itt_analysis_function,
       covariates_baseline = FALSE,
       covariate_terms = NULL,
       sample_name = "1.C.hispanic",
       analysis_subgroup = d$age_groups_ac == "Age 13-18" & d$hispanic == 1,
       analysis_subgroup_label = "Age 13-18, Ethnicity: Hispanic",
       my_data = d, 
       weights = d$f_wt_totcore98, 
       subgroup_file_label = "child_13_18_hispanic",
       marginal_means = FALSE,
       explicit_sample_desc = "children 13-18, Hispanic",
       implicit_sample_desc = "adult; children less than 13 years of age; children 13-18 non-Hispanic",
       clusters = d$FAMID, no_site = TRUE
)

################################################################
# Check that people in the experimental group did not move to
# less competitive states

# Clean the data
# Load the geo_data
mto_geo <- read_sas("[redacted]/mto_fin_addrhx.sas7bdat")
# Convert quarters to numbers
q_num <- expand.grid(year = 1994:2020, quarter = paste0("Q", 1:4))
q_num <- q_num[order(q_num$year),]
q_num_start <- data.frame(q_num_start = 1:nrow(q_num),
                          spell_start_yq = paste0(q_num$year, q_num$quarter))
q_num_cease <- data.frame(q_num_cease = 1:nrow(q_num),
                          spell_cease_yq = paste0(q_num$year, q_num$quarter))
# Convert quarters to numbers in geo data
mto_geo$spell_start_yq <- gsub(pattern = "        ", replacement = "",
                               mto_geo$spell_start_yq)
mto_geo$spell_cease_yq <- gsub(pattern = "        ", replacement = "",
                               mto_geo$spell_cease_yq)
# Merge in the new numerical variables
mto_geo <- merge(x = mto_geo,
                 y = q_num_start,
                 by = "spell_start_yq",
                 all.x = TRUE)
mto_geo <- merge(x = mto_geo,
                 y = q_num_cease,
                 by = "spell_cease_yq",
                 all.x = TRUE)


# Make function that extracts where people live during Q4 of election years
election_year_geo <- function(row_id,
                              election_quarters,
                              election_year) {
  ad_quarters <- mto_geo$q_num_start[row_id]:mto_geo$q_num_cease[row_id]
  if (sum(election_quarters %in% ad_quarters) == 2) {
    data.frame(ppid = mto_geo$ppid[row_id],
               state = mto_geo$fin_state[row_id])
  }
}

# Import data on election returns margins
e_margins <- read.csv("/projects/data/analysis_data/pres_election_margins.csv")

# Run the function above
# 2000
election2000 <- do.call(rbind, lapply(1:nrow(mto_geo), FUN = election_year_geo,
                                      election_quarters = c(27, 28),
                                      election_year = 2000))
names(election2000)[2] <- "state_2000"
e_margins_2000 <- e_margins[e_margins$year == 2000, c("state_ab", "margin")]
names(e_margins_2000) <- c("state_2000", "margin_2000")
e_margins_2000$state_2000 <- toupper(e_margins_2000$state_2000)
election2000 <- merge(x = election2000, y = e_margins_2000, by = "state_2000",
                      all.x = TRUE)

# 2004
election2004 <- do.call(rbind, lapply(1:nrow(mto_geo), FUN = election_year_geo,
                                      election_quarters = c(43, 44),
                                      election_year = 2004))
names(election2004)[2] <- "state_2004"
e_margins_2004 <- e_margins[e_margins$year == 2004, c("state_ab", "margin")]
names(e_margins_2004) <- c("state_2004", "margin_2004")
e_margins_2004$state_2004 <- toupper(e_margins_2004$state_2004)
election2004 <- merge(x = election2004, y = e_margins_2004, by = "state_2004",
                      all.x = TRUE)

# 2008
election2008 <- do.call(rbind, lapply(1:nrow(mto_geo), FUN = election_year_geo,
                                      election_quarters = c(59, 60),
                                      election_year = 2008))
names(election2008)[2] <- "state_2008"
e_margins_2008 <- e_margins[e_margins$year == 2008, c("state_ab", "margin")]
names(e_margins_2008) <- c("state_2008", "margin_2008")
e_margins_2008$state_2008 <- toupper(e_margins_2008$state_2008)
election2008 <- merge(x = election2008, y = e_margins_2008, by = "state_2008",
                      all.x = TRUE)

# Combine the election margins into one dataset
election_m <- merge(x = election2000,
                    y = election2004,
                    by = "ppid", all = TRUE)
election_m <- merge(x = election_m,
                    y = election2008,
                    by = "ppid", all = TRUE)

# Remove duplicated rows
election_m <- dplyr::distinct(election_m)
names(election_m)[1] <- "PPID"

# Merge the election margins data into d
d <- merge(x = d, y = election_m, all.x = TRUE, by = "PPID")

# Take the absolute value of the margins
d$margin_2000 <- abs(d$margin_2000)
d$margin_2004 <- abs(d$margin_2004)
d$margin_2008 <- abs(d$margin_2008)

# Run the analysis
# 2000 Presidential Election
itt_function(outcome = "margin_2000",
             outcome_label = "2020 Presidential Election Margins",
             covariate_terms = NULL,
             covariates_baseline = FALSE,
             subgroup = d$age_groups_ac == "Adult" & !is.na(d$margin_2000),
             subgroup_label = "Adult",
             my_data = d,
             weights = d$f_wt_totcore98,
             clusters = d$FAMID,
             include_N = TRUE,
             subgroup_file_label = "adult",
             marginal_means = FALSE,
             sample_name = "1.A",
             explicit_sample_desc = "adult",
             implicit_sample_desc = "non-adult",
             vote_eligibility_n = NULL,
             vote_eligibility_cluster = NULL,
             output_number = 10)

# 2004
itt_function(outcome = "margin_2004",
             outcome_label = "2004 Presidential Election Margins",
             covariate_terms = NULL,
             covariates_baseline = FALSE,
             subgroup = d$age_groups_ac == "Adult",
             subgroup_label = "Adult",
             my_data = d,
             weights = d$f_wt_totcore98,
             clusters = d$FAMID,
             include_N = TRUE,
             subgroup_file_label = "adult",
             marginal_means = FALSE,
             sample_name = "1.A",
             explicit_sample_desc = "adult",
             implicit_sample_desc = "non-adult",
             vote_eligibility_n = NULL,
             vote_eligibility_cluster = NULL,
             output_number = 11)

# 2008
itt_function(outcome = "margin_2008",
             outcome_label = "2008 Presidential Election Margins",
             covariate_terms = NULL,
             covariates_baseline = FALSE,
             subgroup = d$age_groups_ac == "Adult",
             subgroup_label = "Adult",
             my_data = d,
             weights = d$f_wt_totcore98,
             clusters = d$FAMID,
             include_N = TRUE,
             subgroup_file_label = "adult",
             marginal_means = FALSE,
             sample_name = "1.A",
             explicit_sample_desc = "adult",
             implicit_sample_desc = "non-adult",
             vote_eligibility_n = NULL,
             vote_eligibility_cluster = NULL,
             output_number = 12)




